xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (317)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +11 -28
  3. xinference/client/restful/async_restful_client.py +20 -3
  4. xinference/client/restful/restful_client.py +20 -3
  5. xinference/core/supervisor.py +87 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +38 -1
  11. xinference/model/image/model_spec.json +69 -0
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +4 -0
  14. xinference/model/llm/llm_family.json +464 -2
  15. xinference/model/llm/sglang/core.py +30 -11
  16. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  17. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  18. xinference/model/llm/utils.py +12 -9
  19. xinference/model/llm/vllm/core.py +93 -17
  20. xinference/thirdparty/audiotools/__init__.py +10 -0
  21. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  22. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  23. xinference/thirdparty/audiotools/core/display.py +194 -0
  24. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  25. xinference/thirdparty/audiotools/core/effects.py +647 -0
  26. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  27. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  28. xinference/thirdparty/audiotools/core/playback.py +252 -0
  29. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  30. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  31. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  32. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  33. xinference/thirdparty/audiotools/core/util.py +671 -0
  34. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  35. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  36. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  37. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  38. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  39. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  40. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  41. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  42. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  43. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  44. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  45. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  46. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  47. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  48. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  49. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  50. xinference/thirdparty/audiotools/post.py +140 -0
  51. xinference/thirdparty/audiotools/preference.py +600 -0
  52. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  53. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  54. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  55. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  56. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  57. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  58. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  59. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  60. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  61. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  62. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  63. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  72. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  73. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  74. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  75. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  76. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  77. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  78. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  79. xinference/thirdparty/indextts/__init__.py +0 -0
  80. xinference/thirdparty/indextts/cli.py +65 -0
  81. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  82. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  83. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  84. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  85. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  86. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  87. xinference/thirdparty/indextts/gpt/model.py +713 -0
  88. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  89. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  90. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  91. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  92. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  93. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  94. xinference/thirdparty/indextts/infer.py +690 -0
  95. xinference/thirdparty/indextts/infer_v2.py +739 -0
  96. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  97. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  98. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  99. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  100. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  101. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  102. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  103. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  104. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  105. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  106. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  107. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  108. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  109. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  110. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  111. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  112. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  113. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  114. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  115. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  116. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  117. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  118. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  119. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  120. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  121. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  123. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  124. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  133. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  134. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  135. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  136. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  137. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  138. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  139. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  140. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  141. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  142. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  143. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  144. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  145. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  146. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  147. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  148. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  149. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  150. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  151. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  152. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  153. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  154. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  155. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  159. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  160. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  161. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  162. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  163. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  164. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  165. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  166. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  167. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  168. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  169. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  170. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  171. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  172. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  173. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  174. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  175. xinference/thirdparty/indextts/utils/common.py +121 -0
  176. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  177. xinference/thirdparty/indextts/utils/front.py +536 -0
  178. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  179. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  180. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  181. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  182. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  183. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  184. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  185. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  186. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  240. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  241. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  242. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  243. xinference/thirdparty/indextts/utils/utils.py +93 -0
  244. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  245. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  246. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  247. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  248. xinference/ui/gradio/media_interface.py +66 -8
  249. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  250. xinference/ui/web/ui/build/index.html +1 -1
  251. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  252. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  253. xinference/ui/web/ui/build/static/js/main.d192c4f3.js +3 -0
  254. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.d192c4f3.js.LICENSE.txt} +0 -7
  255. xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +1 -0
  256. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  257. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  258. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  259. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  260. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  261. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  262. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  263. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  264. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  265. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  266. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +1 -0
  273. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  274. xinference/ui/web/ui/package-lock.json +0 -34
  275. xinference/ui/web/ui/package.json +0 -1
  276. xinference/ui/web/ui/src/locales/en.json +9 -3
  277. xinference/ui/web/ui/src/locales/ja.json +9 -3
  278. xinference/ui/web/ui/src/locales/ko.json +9 -3
  279. xinference/ui/web/ui/src/locales/zh.json +9 -3
  280. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/METADATA +18 -2
  281. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/RECORD +285 -67
  282. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  283. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  284. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  285. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  286. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  287. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  288. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  289. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  290. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  291. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  292. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  293. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  294. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  295. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  296. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  302. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  303. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  304. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  305. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  306. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  307. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  308. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  309. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  310. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  311. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  312. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  313. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  314. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/WHEEL +0 -0
  315. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/entry_points.txt +0 -0
  316. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/licenses/LICENSE +0 -0
  317. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Tuple, Union, Optional
+
+import torch
+import yaml
+from torch import nn
+from .heads import ISTFTHead
+from .models import VocosBackbone
+
+
+class Vocos(nn.Module):
+    """
+    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
+    This class is primarily designed for inference, with support for loading from pretrained
+    model checkpoints. It consists of three main components: a feature extractor,
+    a backbone, and a head.
+    """
+
+    def __init__(
+        self, args,
+    ):
+        super().__init__()
+        self.backbone = VocosBackbone(
+            input_channels=args.vocos.backbone.input_channels,
+            dim=args.vocos.backbone.dim,
+            intermediate_dim=args.vocos.backbone.intermediate_dim,
+            num_layers=args.vocos.backbone.num_layers,
+        )
+        self.head = ISTFTHead(
+            dim=args.vocos.head.dim,
+            n_fft=args.vocos.head.n_fft,
+            hop_length=args.vocos.head.hop_length,
+            padding=args.vocos.head.padding,
+        )
+
+    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        """
+        Method to decode audio waveform from already calculated features. The features input is passed through
+        the backbone and the head to reconstruct the audio output.
+
+        Args:
+            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                C denotes the feature dimension, and L is the sequence length.
+
+        Returns:
+            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+        """
+        x = self.backbone(features_input, **kwargs)
+        audio_output = self.head(x)
+        return audio_output
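For orientation, a minimal usage sketch of the wrapper above. The nested config fields mirror exactly what `__init__` reads; the concrete sizes are illustrative assumptions, and the import path is inferred from the file list.

```python
# Hypothetical usage of the vendored Vocos wrapper above; the nested config
# mirrors what __init__ reads, but the concrete sizes are illustrative guesses.
from types import SimpleNamespace

import torch

from xinference.thirdparty.indextts.s2mel.modules.vocos.pretrained import Vocos

args = SimpleNamespace(
    vocos=SimpleNamespace(
        backbone=SimpleNamespace(
            input_channels=100, dim=512, intermediate_dim=1536, num_layers=8
        ),
        head=SimpleNamespace(dim=512, n_fft=1024, hop_length=256, padding="same"),
    )
)
vocoder = Vocos(args).eval()

features = torch.randn(1, 100, 200)  # (B, C, L) mel-like features
with torch.no_grad():
    audio = vocoder(features)        # (B, T) reconstructed waveform
print(audio.shape)
```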
xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py
@@ -0,0 +1,192 @@
+import numpy as np
+import scipy
+import torch
+from torch import nn, view_as_real, view_as_complex
+
+
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+
+    def forward(self, spec: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                N is the number of frequency bins, and T is the number of time frames.
+
+        Returns:
+            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
+        elif self.padding == "same":
+            pad = (self.win_length - self.hop_length) // 2
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        assert spec.dim() == 3, "Expected a 3D tensor as input"
+        B, N, T = spec.shape
+
+        # Inverse FFT
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+
+        # Overlap and Add
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+        )[:, 0, 0, pad:-pad]
+
+        # Window envelope
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = torch.nn.functional.fold(
+            window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+        ).squeeze()[pad:-pad]
+
+        # Normalize
+        assert (window_envelope > 1e-11).all()
+        y = y / window_envelope
+
+        return y
+
+
+class MDCT(nn.Module):
+    """
+    Modified Discrete Cosine Transform (MDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len)
+        post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N)
+        # view_as_real: NCCL Backend does not support ComplexFloat data type
+        # https://github.com/pytorch/pytorch/issues/71613
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, audio: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.
+
+        Args:
+            audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
+                and T is the length of the audio.
+
+        Returns:
+            Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
+                and N is the number of frequency bins.
+        """
+        if self.padding == "center":
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 2, self.frame_len // 2))
+        elif self.padding == "same":
+            # hop_length is 1/2 frame_len
+            audio = torch.nn.functional.pad(audio, (self.frame_len // 4, self.frame_len // 4))
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        x = audio.unfold(-1, self.frame_len, self.frame_len // 2)
+        N = self.frame_len // 2
+        x = x * self.window.expand(x.shape)
+        X = torch.fft.fft(x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1)[..., :N]
+        res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N)
+        return torch.real(res) * np.sqrt(2)
+
+
+class IMDCT(nn.Module):
+    """
+    Inverse Modified Discrete Cosine Transform (IMDCT) module.
+
+    Args:
+        frame_len (int): Length of the MDCT frame.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, frame_len: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.frame_len = frame_len
+        N = frame_len // 2
+        n0 = (N + 1) / 2
+        window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
+        self.register_buffer("window", window)
+
+        pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N)
+        post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2))
+        self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
+        self.register_buffer("post_twiddle", view_as_real(post_twiddle))
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.
+
+        Args:
+            X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
+                L is the number of frames, and N is the number of frequency bins.
+
+        Returns:
+            Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
+        """
+        B, L, N = X.shape
+        Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device)
+        Y[..., :N] = X
+        Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,)))
+        y = torch.fft.ifft(Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1)
+        y = torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2)
+        result = y * self.window.expand(y.shape)
+        output_size = (1, (L + 1) * N)
+        audio = torch.nn.functional.fold(
+            result.transpose(1, 2),
+            output_size=output_size,
+            kernel_size=(1, self.frame_len),
+            stride=(1, self.frame_len // 2),
+        )[:, 0, 0, :]
+
+        if self.padding == "center":
+            pad = self.frame_len // 2
+        elif self.padding == "same":
+            pad = self.frame_len // 4
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        audio = audio[:, pad:-pad]
+        return audio
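As a sanity check, a minimal round-trip sketch for the custom ISTFT above: analysis with `torch.stft(center=True)` pairs with the "center" fallback path, while the "same" path instead trims `(win_length - hop_length) // 2` samples per edge. The import path is inferred from the file list.

```python
# Round-trip sketch for the ISTFT module above; "center" padding falls back
# to torch.istft, so a centered STFT should invert near-exactly.
import torch

from xinference.thirdparty.indextts.s2mel.modules.vocos.spectral_ops import ISTFT

n_fft, hop, win = 1024, 256, 1024
x = torch.randn(2, hop * 100)  # (B, T) toy signals

window = torch.hann_window(win)
spec = torch.stft(
    x, n_fft, hop_length=hop, win_length=win, window=window,
    center=True, return_complex=True,
)

istft = ISTFT(n_fft=n_fft, hop_length=hop, win_length=win, padding="center")
y = istft(spec)
print(torch.allclose(x[:, : y.shape[-1]], y, atol=1e-4))  # True
```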
xinference/thirdparty/indextts/s2mel/modules/wavenet.py
@@ -0,0 +1,174 @@
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from indextts.s2mel.modules.encodec import SConv1d
+
+from . import commons
+LRELU_SLOPE = 0.1
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(
+            nn.ReLU(),
+            nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DDSConv(nn.Module):
+    """
+    Dialted and Depth-Separable Convolution
+    """
+
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                groups=channels, dilation=dilation, padding=padding
+            ))
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = F.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = F.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0, causal=False):
+        super(WN, self).__init__()
+        conv1d_type = SConv1d
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size,
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            self.cond_layer = conv1d_type(gin_channels, 2 * hidden_channels * n_layers, 1, norm='weight_norm')
+
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = conv1d_type(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation,
+                                   padding=padding, norm='weight_norm', causal=causal)
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = conv1d_type(hidden_channels, res_skip_channels, 1, norm='weight_norm', causal=causal)
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = commons.fused_add_tanh_sigmoid_multiply(
+                x_in,
+                g_l,
+                n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
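A small, hedged shape check for the DDSConv block above, assuming the vendored indextts package is importable; the sizes are arbitrary.

```python
# Shape-check sketch for DDSConv above (import path per the file list).
# The (B, 1, T) mask zeroes padded frames, as in the forward pass.
import torch

from xinference.thirdparty.indextts.s2mel.modules.wavenet import DDSConv

block = DDSConv(channels=64, kernel_size=3, n_layers=4, p_dropout=0.1).eval()
x = torch.randn(2, 64, 50)      # (B, C, T)
x_mask = torch.ones(2, 1, 50)   # 1 = valid frame
x_mask[1, :, 40:] = 0           # second item padded past frame 40
y = block(x, x_mask)
print(y.shape)                  # torch.Size([2, 64, 50])
```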
xinference/thirdparty/indextts/s2mel/optimizers.py
@@ -0,0 +1,96 @@
+#coding:utf-8
+import os, sys
+import os.path as osp
+import numpy as np
+import torch
+from torch import nn
+from torch.optim import Optimizer
+from functools import reduce
+from torch.optim import AdamW
+
+class MultiOptimizer:
+    def __init__(self, optimizers={}, schedulers={}):
+        self.optimizers = optimizers
+        self.schedulers = schedulers
+        self.keys = list(optimizers.keys())
+        self.param_groups = reduce(lambda x,y: x+y, [v.param_groups for v in self.optimizers.values()])
+
+    def state_dict(self):
+        state_dicts = [(key, self.optimizers[key].state_dict())\
+                       for key in self.keys]
+        return state_dicts
+
+    def scheduler_state_dict(self):
+        state_dicts = [(key, self.schedulers[key].state_dict())\
+                       for key in self.keys]
+        return state_dicts
+
+    def load_state_dict(self, state_dict):
+        for key, val in state_dict:
+            try:
+                self.optimizers[key].load_state_dict(val)
+            except:
+                print("Unloaded %s" % key)
+
+    def load_scheduler_state_dict(self, state_dict):
+        for key, val in state_dict:
+            try:
+                self.schedulers[key].load_state_dict(val)
+            except:
+                print("Unloaded %s" % key)
+
+    def step(self, key=None, scaler=None):
+        keys = [key] if key is not None else self.keys
+        _ = [self._step(key, scaler) for key in keys]
+
+    def _step(self, key, scaler=None):
+        if scaler is not None:
+            scaler.step(self.optimizers[key])
+            scaler.update()
+        else:
+            self.optimizers[key].step()
+
+    def zero_grad(self, key=None):
+        if key is not None:
+            self.optimizers[key].zero_grad()
+        else:
+            _ = [self.optimizers[key].zero_grad() for key in self.keys]
+
+    def scheduler(self, *args, key=None):
+        if key is not None:
+            self.schedulers[key].step(*args)
+        else:
+            _ = [self.schedulers[key].step_batch(*args) for key in self.keys]
+
+def define_scheduler(optimizer, params):
+    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=params['gamma'])
+
+    return scheduler
+
+def build_optimizer(model_dict, lr, type='AdamW'):
+    optim = {}
+    for key, model in model_dict.items():
+        model_parameters = model.parameters()
+        parameters_names = []
+        parameters_names.append(
+            [
+                name_param_pair[0]
+                for name_param_pair in model.named_parameters()
+            ]
+        )
+        if type == 'AdamW':
+            optim[key] = AdamW(
+                model_parameters,
+                lr=lr,
+                betas=(0.9, 0.98),
+                eps=1e-9,
+                weight_decay=0.1,
+            )
+        else:
+            raise ValueError('Unknown optimizer type: %s' % type)
+
+    schedulers = dict([(key, torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.999996))
+                       for key, opt in optim.items()])
+
+    multi_optim = MultiOptimizer(optim, schedulers)
+    return multi_optim
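A hedged usage sketch for `build_optimizer` above (import path per the file list): it creates one AdamW plus an ExponentialLR scheduler per named sub-model and drives them through the `MultiOptimizer` wrapper.

```python
# Sketch only; the model dict and sizes are illustrative assumptions.
import torch
from torch import nn

from xinference.thirdparty.indextts.s2mel.optimizers import build_optimizer

models = {"gen": nn.Linear(8, 8), "disc": nn.Linear(8, 1)}
optim = build_optimizer(models, lr=1e-4)

x = torch.randn(4, 8)
loss = models["gen"](x).sum() + models["disc"](x).sum()
optim.zero_grad()
loss.backward()
optim.step()         # steps every wrapped optimizer
optim.step("gen")    # or a single one by key
```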
xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py
@@ -0,0 +1,148 @@
+from transformers import SeamlessM4TFeatureExtractor
+from transformers import Wav2Vec2BertModel
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import librosa
+import os
+import pickle
+import math
+import json
+import safetensors
+import json5
+# from codec.kmeans.repcodec_model import RepCodec
+from startts.examples.ftchar.models.codec.kmeans.repcodec_model import RepCodec
+
+class JsonHParams:
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if type(v) == dict:
+                v = JsonHParams(**v)
+            self[k] = v
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def items(self):
+        return self.__dict__.items()
+
+    def values(self):
+        return self.__dict__.values()
+
+    def __len__(self):
+        return len(self.__dict__)
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __repr__(self):
+        return self.__dict__.__repr__()
+
+
+def _load_config(config_fn, lowercase=False):
+    """Load configurations into a dictionary
+
+    Args:
+        config_fn (str): path to configuration file
+        lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
+
+    Returns:
+        dict: dictionary that stores configurations
+    """
+    with open(config_fn, "r") as f:
+        data = f.read()
+    config_ = json5.loads(data)
+    if "base_config" in config_:
+        # load configurations from new path
+        p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
+        p_config_ = _load_config(p_config_path)
+        config_ = override_config(p_config_, config_)
+    if lowercase:
+        # change keys in config_ to lower case
+        config_ = get_lowercase_keys_config(config_)
+    return config_
+
+
+def load_config(config_fn, lowercase=False):
+    """Load configurations into a dictionary
+
+    Args:
+        config_fn (str): path to configuration file
+        lowercase (bool, optional): _description_. Defaults to False.
+
+    Returns:
+        JsonHParams: an object that stores configurations
+    """
+    config_ = _load_config(config_fn, lowercase=lowercase)
+    # create an JsonHParams object with configuration dict
+    cfg = JsonHParams(**config_)
+    return cfg
+
+class Extract_wav2vectbert:
+    def __init__(self,device):
+        #semantic_model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
+        self.semantic_model = Wav2Vec2BertModel.from_pretrained("./MaskGCT_model/w2v_bert/")
+        self.semantic_model.eval()
+        self.semantic_model.to(device)
+        self.stat_mean_var = torch.load("./MaskGCT_model/wav2vec2bert_stats.pt")
+        self.semantic_mean = self.stat_mean_var["mean"]
+        self.semantic_std = torch.sqrt(self.stat_mean_var["var"])
+        self.semantic_mean = self.semantic_mean.to(device)
+        self.semantic_std = self.semantic_std.to(device)
+        self.processor = SeamlessM4TFeatureExtractor.from_pretrained(
+            "./MaskGCT_model/w2v_bert/")
+        self.device = device
+
+        cfg_maskgct = load_config('./MaskGCT_model/maskgct.json')
+        cfg = cfg_maskgct.model.semantic_codec
+        self.semantic_code_ckpt = r'./MaskGCT_model/semantic_codec/model.safetensors'
+        self.semantic_codec = RepCodec(cfg=cfg)
+        self.semantic_codec.eval()
+        self.semantic_codec.to(device)
+        safetensors.torch.load_model(self.semantic_codec, self.semantic_code_ckpt)
+
+    @torch.no_grad()
+    def extract_features(self, speech): # speech [b,T]
+        inputs = self.processor(speech, sampling_rate=16000, return_tensors="pt")
+        input_features = inputs["input_features"]
+        attention_mask = inputs["attention_mask"]
+        return input_features, attention_mask #[2, 620, 160] [2, 620]
+
+    @torch.no_grad()
+    def extract_semantic_code(self, input_features, attention_mask):
+        vq_emb = self.semantic_model( # Wav2Vec2BertModel
+            input_features=input_features,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )
+        feat = vq_emb.hidden_states[17] # (B, T, C)
+        feat = (feat - self.semantic_mean.to(feat)) / self.semantic_std.to(feat)
+
+        semantic_code, rec_feat = self.semantic_codec.quantize(feat) # (B, T)
+        return semantic_code, rec_feat
+
+    def feature_extract(self, prompt_speech):
+
+        input_features, attention_mask = self.extract_features(prompt_speech)
+        input_features = input_features.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        semantic_code, rec_feat = self.extract_semantic_code(input_features, attention_mask)
+        return semantic_code,rec_feat
+
+if __name__=='__main__':
+    speech_path = 'test/magi1.wav'
+    speech = librosa.load(speech_path, sr=16000)[0]
+    speech = np.c_[speech,speech,speech].T #[2, 198559]
+    print(speech.shape)
+
+    Extract_feature = Extract_wav2vectbert('cuda:0')
+    semantic_code,rec_feat = Extract_feature.feature_extract(speech)
+    print(semantic_code.shape,rec_feat.shape)
+
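The extractor above loads everything from local `./MaskGCT_model` checkpoints. Below is a rough, hedged sketch of the same feature-extraction flow using the public hub id that the commented-out line references, with random audio and no RepCodec quantization step.

```python
# Sketch of the w2v-bert feature path above; uses the public checkpoint id
# from the commented-out line rather than the local ./MaskGCT_model files.
import numpy as np
import torch
from transformers import SeamlessM4TFeatureExtractor, Wav2Vec2BertModel

processor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0").eval()

speech = np.random.randn(16000).astype(np.float32)  # 1 s of fake 16 kHz audio
inputs = processor(speech, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    out = model(
        input_features=inputs["input_features"],
        attention_mask=inputs["attention_mask"],
        output_hidden_states=True,
    )
feat = out.hidden_states[17]  # the layer tapped above, shape (B, T, C)
print(feat.shape)
```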