xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (328)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +473 -31
  3. xinference/client/restful/async_restful_client.py +178 -8
  4. xinference/client/restful/restful_client.py +151 -3
  5. xinference/core/supervisor.py +99 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +58 -21
  11. xinference/model/image/model_spec.json +159 -90
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +6 -2
  14. xinference/model/llm/llm_family.json +1299 -174
  15. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  16. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  17. xinference/model/llm/sglang/core.py +44 -11
  18. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  19. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  20. xinference/model/llm/transformers/chatglm.py +3 -0
  21. xinference/model/llm/transformers/core.py +129 -36
  22. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  23. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  24. xinference/model/llm/transformers/utils.py +23 -0
  25. xinference/model/llm/utils.py +48 -32
  26. xinference/model/llm/vllm/core.py +207 -72
  27. xinference/model/utils.py +74 -31
  28. xinference/thirdparty/audiotools/__init__.py +10 -0
  29. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  30. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  31. xinference/thirdparty/audiotools/core/display.py +194 -0
  32. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  33. xinference/thirdparty/audiotools/core/effects.py +647 -0
  34. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  35. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  36. xinference/thirdparty/audiotools/core/playback.py +252 -0
  37. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  38. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  39. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  40. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  41. xinference/thirdparty/audiotools/core/util.py +671 -0
  42. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  43. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  44. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  45. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  46. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  47. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  48. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  49. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  50. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  51. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  52. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  53. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  54. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  55. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  56. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  57. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  58. xinference/thirdparty/audiotools/post.py +140 -0
  59. xinference/thirdparty/audiotools/preference.py +600 -0
  60. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  61. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  62. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  63. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  72. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  73. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  74. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  75. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  76. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  77. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  78. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  79. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  80. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  81. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  82. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  83. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  84. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  85. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  86. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  87. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  88. xinference/thirdparty/indextts/__init__.py +0 -0
  89. xinference/thirdparty/indextts/cli.py +65 -0
  90. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  91. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  92. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  93. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  94. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  95. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  96. xinference/thirdparty/indextts/gpt/model.py +713 -0
  97. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  98. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  99. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  100. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  101. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  102. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  103. xinference/thirdparty/indextts/infer.py +690 -0
  104. xinference/thirdparty/indextts/infer_v2.py +739 -0
  105. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  106. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  107. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  108. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  109. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  110. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  111. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  112. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  113. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  114. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  115. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  116. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  117. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  118. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  119. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  120. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  121. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  123. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  124. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  133. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  134. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  135. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  136. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  137. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  138. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  139. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  140. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  141. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  142. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  143. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  144. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  145. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  146. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  147. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  148. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  149. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  150. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  151. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  152. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  153. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  154. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  155. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  159. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  160. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  161. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  162. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  163. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  164. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  165. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  166. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  167. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  168. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  169. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  170. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  171. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  172. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  173. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  174. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  175. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  176. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  177. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  178. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  179. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  180. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  181. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  182. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  183. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  184. xinference/thirdparty/indextts/utils/common.py +121 -0
  185. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  186. xinference/thirdparty/indextts/utils/front.py +536 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  240. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  241. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  242. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  243. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  244. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  245. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  246. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  247. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  248. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  249. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  250. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  251. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  252. xinference/thirdparty/indextts/utils/utils.py +93 -0
  253. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  254. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  255. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  256. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  257. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  258. xinference/types.py +9 -0
  259. xinference/ui/gradio/media_interface.py +66 -8
  260. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  261. xinference/ui/web/ui/build/index.html +1 -1
  262. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  263. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  264. xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
  265. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
  266. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  273. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  274. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  275. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  276. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  277. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  278. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  279. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  280. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  281. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  282. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  283. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  284. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  285. xinference/ui/web/ui/package-lock.json +0 -34
  286. xinference/ui/web/ui/package.json +0 -1
  287. xinference/ui/web/ui/src/locales/en.json +9 -3
  288. xinference/ui/web/ui/src/locales/ja.json +9 -3
  289. xinference/ui/web/ui/src/locales/ko.json +9 -3
  290. xinference/ui/web/ui/src/locales/zh.json +9 -3
  291. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
  292. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
  293. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  294. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  295. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  296. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  302. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  303. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  304. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  305. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  306. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  307. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  308. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  309. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  310. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  311. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  312. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  313. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  314. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  315. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  316. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  317. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  318. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  319. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  320. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  321. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  322. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  323. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  324. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  325. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  326. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  327. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  328. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/audiotools/core/effects.py
@@ -0,0 +1,647 @@
+import typing
+
+import julius
+import numpy as np
+import torch
+import torchaudio
+
+from . import util
+
+
+class EffectMixin:
+    GAIN_FACTOR = np.log(10) / 20
+    """Gain factor for converting between amplitude and decibels."""
+    CODEC_PRESETS = {
+        "8-bit": {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8},
+        "GSM-FR": {"format": "gsm"},
+        "MP3": {"format": "mp3", "compression": -9},
+        "Vorbis": {"format": "vorbis", "compression": -1},
+        "Ogg": {
+            "format": "ogg",
+            "compression": -1,
+        },
+        "Amr-nb": {"format": "amr-nb"},
+    }
+    """Presets for applying codecs via torchaudio."""
+
+    def mix(
+        self,
+        other,
+        snr: typing.Union[torch.Tensor, np.ndarray, float] = 10,
+        other_eq: typing.Union[torch.Tensor, np.ndarray] = None,
+    ):
+        """Mixes noise with signal at specified
+        signal-to-noise ratio. Optionally, the
+        other signal can be equalized in-place.
+
+        Parameters
+        ----------
+        other : AudioSignal
+            AudioSignal object to mix with.
+        snr : typing.Union[torch.Tensor, np.ndarray, float], optional
+            Signal-to-noise ratio, by default 10
+        other_eq : typing.Union[torch.Tensor, np.ndarray], optional
+            EQ curve to apply to other signal, if any, by default None
+
+        Returns
+        -------
+        AudioSignal
+            In-place modification of AudioSignal.
+        """
+        snr = util.ensure_tensor(snr).to(self.device)
+
+        pad_len = max(0, self.signal_length - other.signal_length)
+        other.zero_pad(0, pad_len)
+        other.truncate_samples(self.signal_length)
+        if other_eq is not None:
+            other = other.equalizer(other_eq)
+
+        tgt_loudness = self.loudness() - snr
+        other = other.normalize(tgt_loudness)
+
+        self.audio_data = self.audio_data + other.audio_data
+        return self
+
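The SNR arithmetic above is worth spelling out: the other signal is loudness-normalized to `self.loudness() - snr` LUFS before being summed in place. A minimal usage sketch, assuming the vendored `audiotools.AudioSignal` API and placeholder file names:

```python
# Minimal sketch of SNR-based mixing; "speech.wav" and "babble.wav"
# are placeholder inputs, not files shipped with this package.
from xinference.thirdparty.audiotools import AudioSignal

signal = AudioSignal("speech.wav")
noise = AudioSignal("babble.wav")

# The noise is loudness-normalized to signal.loudness() - 10 LUFS,
# then added in place, giving roughly a 10 dB signal-to-noise ratio.
signal.mix(noise, snr=10)
```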
+    def convolve(self, other, start_at_max: bool = True):
+        """Convolves self with other.
+        This function uses FFTs to do the convolution.
+
+        Parameters
+        ----------
+        other : AudioSignal
+            Signal to convolve with.
+        start_at_max : bool, optional
+            Whether to start at the max value of other signal, to
+            avoid inducing delays, by default True
+
+        Returns
+        -------
+        AudioSignal
+            Convolved signal, in-place.
+        """
+        from . import AudioSignal
+
+        pad_len = self.signal_length - other.signal_length
+
+        if pad_len > 0:
+            other.zero_pad(0, pad_len)
+        else:
+            other.truncate_samples(self.signal_length)
+
+        if start_at_max:
+            # Use roll to rotate over the max for every item
+            # so that the impulse responses don't induce any
+            # delay.
+            idx = other.audio_data.abs().argmax(axis=-1)
+            irs = torch.zeros_like(other.audio_data)
+            for i in range(other.batch_size):
+                irs[i] = torch.roll(other.audio_data[i], -idx[i].item(), -1)
+            other = AudioSignal(irs, other.sample_rate)
+
+        delta = torch.zeros_like(other.audio_data)
+        delta[..., 0] = 1
+
+        length = self.signal_length
+        delta_fft = torch.fft.rfft(delta, length)
+        other_fft = torch.fft.rfft(other.audio_data, length)
+        self_fft = torch.fft.rfft(self.audio_data, length)
+
+        convolved_fft = other_fft * self_fft
+        convolved_audio = torch.fft.irfft(convolved_fft, length)
+
+        delta_convolved_fft = other_fft * delta_fft
+        delta_audio = torch.fft.irfft(delta_convolved_fft, length)
+
+        # Use the delta to rescale the audio exactly as needed.
+        delta_max = delta_audio.abs().max(dim=-1, keepdims=True)[0]
+        scale = 1 / delta_max.clamp(1e-5)
+        convolved_audio = convolved_audio * scale
+
+        self.audio_data = convolved_audio
+
+        return self
+
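The delta-based rescaling in `convolve` is subtle: pushing a unit impulse through the same FFT path returns the impulse response itself, so its peak measures the gain the IR would apply, and dividing by it keeps the output level stable. A standalone sketch of the trick on plain tensors:

```python
# Standalone sketch of FFT convolution plus the delta-rescaling trick.
import torch

length = 8
x = torch.randn(1, 1, length)       # stand-in for self.audio_data
ir = torch.zeros(1, 1, length)      # toy impulse response
ir[..., 0], ir[..., 3] = 1.0, 0.5   # direct path plus one echo

delta = torch.zeros_like(ir)
delta[..., 0] = 1.0                 # unit impulse

ir_fft = torch.fft.rfft(ir, length)
y = torch.fft.irfft(ir_fft * torch.fft.rfft(x, length), length)
d = torch.fft.irfft(ir_fft * torch.fft.rfft(delta, length), length)

# d equals ir, so its peak is the gain the IR applies to an impulse.
scale = 1 / d.abs().max(dim=-1, keepdim=True)[0].clamp(1e-5)
y = y * scale
```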
+    def apply_ir(
+        self,
+        ir,
+        drr: typing.Union[torch.Tensor, np.ndarray, float] = None,
+        ir_eq: typing.Union[torch.Tensor, np.ndarray] = None,
+        use_original_phase: bool = False,
+    ):
+        """Applies an impulse response to the signal. If ``ir_eq``
+        is specified, the impulse response is equalized before
+        it is applied, using the given curve.
+
+        Parameters
+        ----------
+        ir : AudioSignal
+            Impulse response to convolve with.
+        drr : typing.Union[torch.Tensor, np.ndarray, float], optional
+            Direct-to-reverberant ratio that impulse response will be
+            altered to, if specified, by default None
+        ir_eq : typing.Union[torch.Tensor, np.ndarray], optional
+            Equalization that will be applied to impulse response
+            if specified, by default None
+        use_original_phase : bool, optional
+            Whether to use the original phase, instead of the convolved
+            phase, by default False
+
+        Returns
+        -------
+        AudioSignal
+            Signal with impulse response applied to it
+        """
+        if ir_eq is not None:
+            ir = ir.equalizer(ir_eq)
+        if drr is not None:
+            ir = ir.alter_drr(drr)
+
+        # Save the peak before
+        max_spk = self.audio_data.abs().max(dim=-1, keepdims=True).values
+
+        # Augment the impulse response to simulate microphone effects
+        # and with varying direct-to-reverberant ratio.
+        phase = self.phase
+        self.convolve(ir)
+
+        # Use the input phase
+        if use_original_phase:
+            self.stft()
+            self.stft_data = self.magnitude * torch.exp(1j * phase)
+            self.istft()
+
+        # Rescale to the input's amplitude
+        max_transformed = self.audio_data.abs().max(dim=-1, keepdims=True).values
+        scale_factor = max_spk.clamp(1e-8) / max_transformed.clamp(1e-8)
+        self = self * scale_factor
+
+        return self
+
+    def ensure_max_of_audio(self, max: float = 1.0):
+        """Ensures that ``abs(audio_data) <= max``.
+
+        Parameters
+        ----------
+        max : float, optional
+            Max absolute value of signal, by default 1.0
+
+        Returns
+        -------
+        AudioSignal
+            Signal with values scaled between -max and max.
+        """
+        peak = self.audio_data.abs().max(dim=-1, keepdims=True)[0]
+        peak_gain = torch.ones_like(peak)
+        peak_gain[peak > max] = max / peak[peak > max]
+        self.audio_data = self.audio_data * peak_gain
+        return self
+
+    def normalize(self, db: typing.Union[torch.Tensor, np.ndarray, float] = -24.0):
+        """Normalizes the signal's volume to the specified db, in LUFS.
+        This is GPU-compatible, making for very fast loudness normalization.
+
+        Parameters
+        ----------
+        db : typing.Union[torch.Tensor, np.ndarray, float], optional
+            Loudness to normalize to, by default -24.0
+
+        Returns
+        -------
+        AudioSignal
+            Normalized audio signal.
+        """
+        db = util.ensure_tensor(db).to(self.device)
+        ref_db = self.loudness()
+        gain = db - ref_db
+        gain = torch.exp(gain * self.GAIN_FACTOR)
+
+        self.audio_data = self.audio_data * gain[:, None, None]
+        return self
+
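`GAIN_FACTOR` is `ln(10) / 20`, so `torch.exp(gain * GAIN_FACTOR)` is exactly the standard dB-to-amplitude conversion `10 ** (gain / 20)`. A quick check:

```python
# exp(db * ln(10) / 20) == 10 ** (db / 20), the standard
# dB-to-amplitude conversion used by normalize() and volume_change().
import numpy as np

GAIN_FACTOR = np.log(10) / 20
for db in (-24.0, -6.0, 0.0, 6.0):
    assert np.isclose(np.exp(db * GAIN_FACTOR), 10 ** (db / 20))
```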
+    def volume_change(self, db: typing.Union[torch.Tensor, np.ndarray, float]):
+        """Change volume of signal by some amount, in dB.
+
+        Parameters
+        ----------
+        db : typing.Union[torch.Tensor, np.ndarray, float]
+            Amount to change volume by.
+
+        Returns
+        -------
+        AudioSignal
+            Signal at new volume.
+        """
+        db = util.ensure_tensor(db, ndim=1).to(self.device)
+        gain = torch.exp(db * self.GAIN_FACTOR)
+        self.audio_data = self.audio_data * gain[:, None, None]
+        return self
+
+    def _to_2d(self):
+        waveform = self.audio_data.reshape(-1, self.signal_length)
+        return waveform
+
+    def _to_3d(self, waveform):
+        return waveform.reshape(self.batch_size, self.num_channels, -1)
+
+    def pitch_shift(self, n_semitones: int, quick: bool = True):
+        """Pitch shift the signal. All items in the batch
+        get the same pitch shift.
+
+        Parameters
+        ----------
+        n_semitones : int
+            How many semitones to shift the signal by.
+        quick : bool, optional
+            Using quick pitch shifting, by default True
+
+        Returns
+        -------
+        AudioSignal
+            Pitch shifted audio signal.
+        """
+        device = self.device
+        effects = [
+            ["pitch", str(n_semitones * 100)],
+            ["rate", str(self.sample_rate)],
+        ]
+        if quick:
+            effects[0].insert(1, "-q")
+
+        waveform = self._to_2d().cpu()
+        waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+            waveform, self.sample_rate, effects, channels_first=True
+        )
+        self.sample_rate = sample_rate
+        self.audio_data = self._to_3d(waveform)
+        return self.to(device)
+
+    def time_stretch(self, factor: float, quick: bool = True):
+        """Time stretch the audio signal.
+
+        Parameters
+        ----------
+        factor : float
+            Factor by which to stretch the AudioSignal. Typically
+            between 0.8 and 1.2.
+        quick : bool, optional
+            Whether to use quick time stretching, by default True
+
+        Returns
+        -------
+        AudioSignal
+            Time-stretched AudioSignal.
+        """
+        device = self.device
+        effects = [
+            ["tempo", str(factor)],
+            ["rate", str(self.sample_rate)],
+        ]
+        if quick:
+            effects[0].insert(1, "-q")
+
+        waveform = self._to_2d().cpu()
+        waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+            waveform, self.sample_rate, effects, channels_first=True
+        )
+        self.sample_rate = sample_rate
+        self.audio_data = self._to_3d(waveform)
+        return self.to(device)
+
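Both `pitch_shift` and `time_stretch` round-trip through sox: move to CPU, flatten to 2D, run a sox effect chain, reshape back. A standalone sketch of that chain (it requires a torchaudio build with sox support; newer torchaudio releases deprecate `sox_effects`):

```python
# Sketch of the sox effect chain behind time_stretch(); requires
# torchaudio with sox support.
import torch
import torchaudio

sample_rate = 16000
waveform = torch.randn(1, sample_rate)  # (channels, time), 1 s of noise

effects = [["tempo", "-q", "1.2"], ["rate", str(sample_rate)]]
stretched, sr = torchaudio.sox_effects.apply_effects_tensor(
    waveform, sample_rate, effects, channels_first=True
)
# tempo 1.2 plays 1.2x faster, so about 1 / 1.2 of the samples remain.
```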
+    def apply_codec(
+        self,
+        preset: str = None,
+        format: str = "wav",
+        encoding: str = None,
+        bits_per_sample: int = None,
+        compression: int = None,
+    ):  # pragma: no cover
+        """Applies an audio codec to the signal.
+
+        Parameters
+        ----------
+        preset : str, optional
+            One of the keys in ``self.CODEC_PRESETS``, by default None
+        format : str, optional
+            Format for audio codec, by default "wav"
+        encoding : str, optional
+            Encoding to use, by default None
+        bits_per_sample : int, optional
+            How many bits per sample, by default None
+        compression : int, optional
+            Compression amount of codec, by default None
+
+        Returns
+        -------
+        AudioSignal
+            AudioSignal with codec applied.
+
+        Raises
+        ------
+        ValueError
+            If preset is not in ``self.CODEC_PRESETS``, an error
+            is thrown.
+        """
+        torchaudio_version_070 = "0.7" in torchaudio.__version__
+        if torchaudio_version_070:
+            return self
+
+        kwargs = {
+            "format": format,
+            "encoding": encoding,
+            "bits_per_sample": bits_per_sample,
+            "compression": compression,
+        }
+
+        if preset is not None:
+            if preset in self.CODEC_PRESETS:
+                kwargs = self.CODEC_PRESETS[preset]
+            else:
+                raise ValueError(
+                    f"Unknown preset: {preset}. "
+                    f"Known presets: {list(self.CODEC_PRESETS.keys())}"
+                )
+
+        waveform = self._to_2d()
+        if kwargs["format"] in ["vorbis", "mp3", "ogg", "amr-nb"]:
+            # Apply it in a for loop
+            augmented = torch.cat(
+                [
+                    torchaudio.functional.apply_codec(
+                        waveform[i][None, :], self.sample_rate, **kwargs
+                    )
+                    for i in range(waveform.shape[0])
+                ],
+                dim=0,
+            )
+        else:
+            augmented = torchaudio.functional.apply_codec(
+                waveform, self.sample_rate, **kwargs
+            )
+        augmented = self._to_3d(augmented)
+
+        self.audio_data = augmented
+        return self
+
+    def mel_filterbank(self, n_bands: int):
+        """Breaks signal into mel bands.
+
+        Parameters
+        ----------
+        n_bands : int
+            Number of mel bands to use.
+
+        Returns
+        -------
+        torch.Tensor
+            Mel-filtered bands, with last axis being the band index.
+        """
+        filterbank = (
+            julius.SplitBands(self.sample_rate, n_bands).float().to(self.device)
+        )
+        filtered = filterbank(self.audio_data)
+        return filtered.permute(1, 2, 3, 0)
+
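`mel_filterbank` leans on `julius.SplitBands`, which splits a waveform into complementary band-limited copies that sum back (approximately) to the input; `equalizer` below exploits exactly that property by reweighting the bands before summing. A small sketch:

```python
# julius.SplitBands produces n_bands complementary signals whose sum
# reconstructs the input up to numerical error.
import julius
import torch

x = torch.randn(1, 1, 16000)
bands = julius.SplitBands(sample_rate=16000, n_bands=6)(x)  # (6, 1, 1, time)
assert torch.allclose(bands.sum(dim=0), x, atol=1e-4)
```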
+    def equalizer(self, db: typing.Union[torch.Tensor, np.ndarray]):
+        """Applies a mel-spaced equalizer to the audio signal.
+
+        Parameters
+        ----------
+        db : typing.Union[torch.Tensor, np.ndarray]
+            EQ curve to apply.
+
+        Returns
+        -------
+        AudioSignal
+            AudioSignal with equalization applied.
+        """
+        db = util.ensure_tensor(db)
+        n_bands = db.shape[-1]
+        fbank = self.mel_filterbank(n_bands)
+
+        # If there's a batch dimension, make sure it's the same.
+        if db.ndim == 2:
+            if db.shape[0] != 1:
+                assert db.shape[0] == fbank.shape[0]
+        else:
+            db = db.unsqueeze(0)
+
+        weights = (10**db).to(self.device).float()
+        fbank = fbank * weights[:, None, None, :]
+        eq_audio_data = fbank.sum(-1)
+        self.audio_data = eq_audio_data
+        return self
+
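One detail worth noting: the per-band weight is `10 ** db`, not `10 ** (db / 20)`, so the curve values act as log10 gains rather than decibels in the strict sense. A minimal usage sketch, again assuming the vendored `AudioSignal` API with a placeholder file:

```python
# Apply a random 6-band cut; each band b is scaled by 10 ** db[b].
import torch
from xinference.thirdparty.audiotools import AudioSignal

signal = AudioSignal("speech.wav")   # placeholder input
eq_curve = -0.5 * torch.rand(6)      # random attenuation per mel band
signal.equalizer(eq_curve)
```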
+    def clip_distortion(
+        self, clip_percentile: typing.Union[torch.Tensor, np.ndarray, float]
+    ):
+        """Clips the signal at a given percentile. The higher it is,
+        the lower the threshold for clipping.
+
+        Parameters
+        ----------
+        clip_percentile : typing.Union[torch.Tensor, np.ndarray, float]
+            Values are between 0.0 and 1.0. Typical values are 0.1 or below.
+
+        Returns
+        -------
+        AudioSignal
+            Audio signal with clipped audio data.
+        """
+        clip_percentile = util.ensure_tensor(clip_percentile, ndim=1)
+        min_thresh = torch.quantile(self.audio_data, clip_percentile / 2, dim=-1)
+        max_thresh = torch.quantile(self.audio_data, 1 - (clip_percentile / 2), dim=-1)
+
+        nc = self.audio_data.shape[1]
+        min_thresh = min_thresh[:, :nc, :]
+        max_thresh = max_thresh[:, :nc, :]
+
+        self.audio_data = self.audio_data.clamp(min_thresh, max_thresh)
+
+        return self
+
+    def quantization(
+        self, quantization_channels: typing.Union[torch.Tensor, np.ndarray, int]
+    ):
+        """Applies quantization to the input waveform.
+
+        Parameters
+        ----------
+        quantization_channels : typing.Union[torch.Tensor, np.ndarray, int]
+            Number of evenly spaced quantization channels to quantize
+            to.
+
+        Returns
+        -------
+        AudioSignal
+            Quantized AudioSignal.
+        """
+        quantization_channels = util.ensure_tensor(quantization_channels, ndim=3)
+
+        x = self.audio_data
+        x = (x + 1) / 2
+        x = x * quantization_channels
+        x = x.floor()
+        x = x / quantization_channels
+        x = 2 * x - 1
+
+        residual = (self.audio_data - x).detach()
+        self.audio_data = self.audio_data - residual
+        return self
+
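The `residual`/`detach` dance at the end of `quantization` (and of `mulaw_quantization` below) is the straight-through estimator: the forward pass emits the quantized values while gradients flow through as if the operation were the identity. A self-contained demonstration:

```python
# Straight-through estimator: forward value of q, gradient of x.
import torch

x = torch.linspace(-1, 1, 5, requires_grad=True)
q = ((x + 1) / 2 * 4).floor() / 4 * 2 - 1   # uniform quantization, 4 channels
ste = x - (x - q).detach()                   # same trick as in the code above

ste.sum().backward()
assert torch.allclose(ste.detach(), q.detach())
assert torch.all(x.grad == 1.0)              # floor() alone would give 0 grad
```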
+    def mulaw_quantization(
+        self, quantization_channels: typing.Union[torch.Tensor, np.ndarray, int]
+    ):
+        """Applies mu-law quantization to the input waveform.
+
+        Parameters
+        ----------
+        quantization_channels : typing.Union[torch.Tensor, np.ndarray, int]
+            Number of mu-law spaced quantization channels to quantize
+            to.
+
+        Returns
+        -------
+        AudioSignal
+            Quantized AudioSignal.
+        """
+        mu = quantization_channels - 1.0
+        mu = util.ensure_tensor(mu, ndim=3)
+
+        x = self.audio_data
+
+        # quantize
+        x = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / torch.log1p(mu)
+        x = ((x + 1) / 2 * mu + 0.5).to(torch.int64)
+
+        # unquantize
+        x = (x / mu) * 2 - 1.0
+        x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.0) / mu
+
+        residual = (self.audio_data - x).detach()
+        self.audio_data = self.audio_data - residual
+        return self
+
+    def __matmul__(self, other):
+        return self.convolve(other)
+
+
+class ImpulseResponseMixin:
+    """These functions are generally only used with AudioSignals that are derived
+    from impulse responses, not other sources like music or speech. These methods
+    are used to replicate the data augmentation described in [1].
+
+    1.  Bryan, Nicholas J. "Impulse response data augmentation and deep
+        neural networks for blind room acoustic parameter estimation."
+        ICASSP 2020-2020 IEEE International Conference on Acoustics,
+        Speech and Signal Processing (ICASSP). IEEE, 2020.
+    """
+
+    def decompose_ir(self):
+        """Decomposes an impulse response into early and late
+        field responses.
+        """
+        # Equations 1 and 2
+        # -----------------
+        # Breaking up into early
+        # response + late field response.
+
+        td = torch.argmax(self.audio_data, dim=-1, keepdim=True)
+        t0 = int(self.sample_rate * 0.0025)
+
+        idx = torch.arange(self.audio_data.shape[-1], device=self.device)[None, None, :]
+        idx = idx.expand(self.batch_size, -1, -1)
+        early_idx = (idx >= td - t0) * (idx <= td + t0)
+
+        early_response = torch.zeros_like(self.audio_data, device=self.device)
+        early_response[early_idx] = self.audio_data[early_idx]
+
+        late_idx = ~early_idx
+        late_field = torch.zeros_like(self.audio_data, device=self.device)
+        late_field[late_idx] = self.audio_data[late_idx]
+
+        # Equation 4
+        # ----------
+        # Decompose early response into windowed
+        # direct path and windowed residual.
+
+        window = torch.zeros_like(self.audio_data, device=self.device)
+        for idx in range(self.batch_size):
+            window_idx = early_idx[idx, 0].nonzero()
+            window[idx, ..., window_idx] = self.get_window(
+                "hann", window_idx.shape[-1], self.device
+            )
+        return early_response, late_field, window
+
+    def measure_drr(self):
+        """Measures the direct-to-reverberant ratio of the impulse
+        response.
+
+        Returns
+        -------
+        float
+            Direct-to-reverberant ratio
+        """
+        early_response, late_field, _ = self.decompose_ir()
+        num = (early_response**2).sum(dim=-1)
+        den = (late_field**2).sum(dim=-1)
+        drr = 10 * torch.log10(num / den)
+        return drr
+
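Concretely, the DRR is the energy ratio of the early response to the late field, in dB. A toy check with a dominant direct path and a weak tail (the numbers are illustrative):

```python
# Toy DRR computation: strong direct path, weak reverberant tail.
import torch

early = torch.zeros(1, 1, 1000)
early[..., 100] = 1.0                      # unit direct path
late = 0.01 * torch.randn(1, 1, 1000)      # low-level tail
late[..., :105] = 0.0

drr = 10 * torch.log10((early**2).sum(dim=-1) / (late**2).sum(dim=-1))
# Roughly +10 dB for this tail level (illustrative, not exact).
```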
+    @staticmethod
+    def solve_alpha(early_response, late_field, wd, target_drr):
+        """Used to solve for the alpha value, which is used
+        to alter the drr.
+        """
+        # Equation 5
+        # ----------
+        # Apply the good ol' quadratic formula.
+
+        wd_sq = wd**2
+        wd_sq_1 = (1 - wd) ** 2
+        e_sq = early_response**2
+        l_sq = late_field**2
+        a = (wd_sq * e_sq).sum(dim=-1)
+        b = (2 * (1 - wd) * wd * e_sq).sum(dim=-1)
+        c = (wd_sq_1 * e_sq).sum(dim=-1) - torch.pow(10, target_drr / 10) * l_sq.sum(
+            dim=-1
+        )
+
+        expr = ((b**2) - 4 * a * c).sqrt()
+        alpha = torch.maximum(
+            (-b - expr) / (2 * a),
+            (-b + expr) / (2 * a),
+        )
+        return alpha
+
+    def alter_drr(self, drr: typing.Union[torch.Tensor, np.ndarray, float]):
+        """Alters the direct-to-reverberant ratio of the impulse response.
+
+        Parameters
+        ----------
+        drr : typing.Union[torch.Tensor, np.ndarray, float]
+            Direct-to-reverberant ratio that the impulse response will be
+            altered to.
+
+        Returns
+        -------
+        AudioSignal
+            Altered impulse response.
+        """
+        drr = util.ensure_tensor(drr, 2, self.batch_size).to(self.device)
+
+        early_response, late_field, window = self.decompose_ir()
+        alpha = self.solve_alpha(early_response, late_field, window, drr)
+        min_alpha = (
+            late_field.abs().max(dim=-1)[0] / early_response.abs().max(dim=-1)[0]
+        )
+        alpha = torch.maximum(alpha, min_alpha)[..., None]
+
+        aug_ir_data = (
+            alpha * window * early_response
+            + ((1 - window) * early_response)
+            + late_field
+        )
+        self.audio_data = aug_ir_data
+        self.ensure_max_of_audio()
+        return self
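Putting the two mixins together, the augmentation loop from [1] looks roughly like this, assuming the vendored `AudioSignal` API and placeholder file names:

```python
# End-to-end sketch: reshape an IR to a target DRR, then apply it.
from xinference.thirdparty.audiotools import AudioSignal

speech = AudioSignal("speech.wav")   # placeholder input
ir = AudioSignal("room_ir.wav")      # placeholder impulse response

print(ir.measure_drr())              # DRR of the raw IR, in dB
speech.apply_ir(ir, drr=15)          # alter the IR to ~15 dB DRR, convolve
```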