xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (328)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +473 -31
  3. xinference/client/restful/async_restful_client.py +178 -8
  4. xinference/client/restful/restful_client.py +151 -3
  5. xinference/core/supervisor.py +99 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +58 -21
  11. xinference/model/image/model_spec.json +159 -90
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +6 -2
  14. xinference/model/llm/llm_family.json +1299 -174
  15. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  16. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  17. xinference/model/llm/sglang/core.py +44 -11
  18. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  19. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  20. xinference/model/llm/transformers/chatglm.py +3 -0
  21. xinference/model/llm/transformers/core.py +129 -36
  22. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  23. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  24. xinference/model/llm/transformers/utils.py +23 -0
  25. xinference/model/llm/utils.py +48 -32
  26. xinference/model/llm/vllm/core.py +207 -72
  27. xinference/model/utils.py +74 -31
  28. xinference/thirdparty/audiotools/__init__.py +10 -0
  29. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  30. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  31. xinference/thirdparty/audiotools/core/display.py +194 -0
  32. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  33. xinference/thirdparty/audiotools/core/effects.py +647 -0
  34. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  35. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  36. xinference/thirdparty/audiotools/core/playback.py +252 -0
  37. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  38. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  39. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  40. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  41. xinference/thirdparty/audiotools/core/util.py +671 -0
  42. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  43. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  44. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  45. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  46. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  47. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  48. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  49. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  50. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  51. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  52. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  53. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  54. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  55. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  56. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  57. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  58. xinference/thirdparty/audiotools/post.py +140 -0
  59. xinference/thirdparty/audiotools/preference.py +600 -0
  60. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  61. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  62. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  63. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  72. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  73. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  74. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  75. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  76. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  77. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  78. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  79. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  80. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  81. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  82. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  83. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  84. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  85. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  86. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  87. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  88. xinference/thirdparty/indextts/__init__.py +0 -0
  89. xinference/thirdparty/indextts/cli.py +65 -0
  90. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  91. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  92. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  93. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  94. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  95. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  96. xinference/thirdparty/indextts/gpt/model.py +713 -0
  97. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  98. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  99. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  100. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  101. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  102. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  103. xinference/thirdparty/indextts/infer.py +690 -0
  104. xinference/thirdparty/indextts/infer_v2.py +739 -0
  105. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  106. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  107. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  108. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  109. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  110. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  111. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  112. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  113. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  114. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  115. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  116. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  117. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  118. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  119. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  120. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  121. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  123. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  124. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  133. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  134. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  135. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  136. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  137. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  138. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  139. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  140. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  141. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  142. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  143. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  144. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  145. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  146. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  147. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  148. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  149. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  150. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  151. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  152. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  153. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  154. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  155. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  159. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  160. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  161. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  162. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  163. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  164. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  165. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  166. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  167. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  168. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  169. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  170. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  171. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  172. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  173. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  174. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  175. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  176. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  177. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  178. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  179. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  180. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  181. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  182. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  183. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  184. xinference/thirdparty/indextts/utils/common.py +121 -0
  185. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  186. xinference/thirdparty/indextts/utils/front.py +536 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  240. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  241. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  242. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  243. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  244. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  245. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  246. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  247. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  248. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  249. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  250. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  251. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  252. xinference/thirdparty/indextts/utils/utils.py +93 -0
  253. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  254. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  255. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  256. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  257. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  258. xinference/types.py +9 -0
  259. xinference/ui/gradio/media_interface.py +66 -8
  260. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  261. xinference/ui/web/ui/build/index.html +1 -1
  262. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  263. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  264. xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
  265. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
  266. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  273. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  274. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  275. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  276. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  277. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  278. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  279. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  280. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  281. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  282. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  283. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  284. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  285. xinference/ui/web/ui/package-lock.json +0 -34
  286. xinference/ui/web/ui/package.json +0 -1
  287. xinference/ui/web/ui/src/locales/en.json +9 -3
  288. xinference/ui/web/ui/src/locales/ja.json +9 -3
  289. xinference/ui/web/ui/src/locales/ko.json +9 -3
  290. xinference/ui/web/ui/src/locales/zh.json +9 -3
  291. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
  292. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
  293. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  294. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  295. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  296. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  302. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  303. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  304. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  305. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  306. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  307. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  308. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  309. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  310. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  311. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  312. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  313. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  314. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  315. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  316. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  317. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  318. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  319. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  320. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  321. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  322. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  323. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  324. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  325. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  326. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  327. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  328. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
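
The bulk of the new code is the vendored audiotools library; its transforms module (xinference/thirdparty/audiotools/data/transforms.py, +1592 lines) is shown in full below. For orientation, here is a minimal sketch of the instantiate/transform pattern that the module's docstrings describe. It is illustrative only and not part of the diff: the import paths assume the vendored copy mirrors upstream audiotools, and the random waveform is a placeholder for a real recording.

# Illustrative only, not part of the diff. Assumes the vendored copy mirrors
# upstream audiotools; the random waveform stands in for a real recording.
import numpy as np

from xinference.thirdparty.audiotools import AudioSignal
from xinference.thirdparty.audiotools.data import transforms as tfm

# One second of noise as a placeholder signal (batch, channels, samples).
signal = AudioSignal(np.random.randn(1, 1, 44100).astype("float32"), sample_rate=44100)

# Each parameter is described by a distribution tuple and sampled at instantiate() time.
transform = tfm.Compose(
    tfm.VolumeChange(db=("uniform", -12.0, 0.0)),
    tfm.ClippingDistortion(perc=("uniform", 0.0, 0.1)),
)

kwargs = transform.instantiate(0, signal)     # draw parameters reproducibly from a seed
output = transform(signal.clone(), **kwargs)  # transforms modify signals in place, hence clone()

Separating instantiate() (the random parameter draws) from transform() (the actual audio processing) is what makes these augmentations reproducible and batchable, as the BaseTransform docstring below explains.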
@@ -0,0 +1,1592 @@
1
+ import copy
2
+ from contextlib import contextmanager
3
+ from inspect import signature
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ import torch
8
+ from flatten_dict import flatten
9
+ from flatten_dict import unflatten
10
+ from numpy.random import RandomState
11
+
12
+ from .. import ml
13
+ from ..core import AudioSignal
14
+ from ..core import util
15
+ from .datasets import AudioLoader
16
+
17
+ tt = torch.tensor
18
+ """Shorthand for converting things to torch.tensor."""
19
+
20
+
21
+ class BaseTransform:
22
+ """This is the base class for all transforms that are implemented
23
+ in this library. Transforms have two main operations: ``transform``
24
+ and ``instantiate``.
25
+
26
+ ``instantiate`` sets the parameters randomly
27
+ from distribution tuples for each parameter. For example, for the
28
+ ``BackgroundNoise`` transform, the signal-to-noise ratio (``snr``)
29
+ is chosen randomly by instantiate. By default, it chosen uniformly
30
+ between 10.0 and 30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``).
31
+
32
+ ``transform`` applies the transform using the instantiated parameters.
33
+ A simple example is as follows:
34
+
35
+ >>> seed = 0
36
+ >>> signal = ...
37
+ >>> transform = transforms.NoiseFloor(db = ("uniform", -50.0, -30.0))
38
+ >>> kwargs = transform.instantiate()
39
+ >>> output = transform(signal.clone(), **kwargs)
40
+
41
+ By breaking apart the instantiation of parameters from the actual audio
42
+ processing of the transform, we can make things more reproducible, while
43
+ also applying the transform on batches of data efficiently on GPU,
44
+ rather than on individual audio samples.
45
+
46
+ .. note::
47
+ We call ``signal.clone()`` for the input to the ``transform`` function
48
+ because signals are modified in-place! If you don't clone the signal,
49
+ you will lose the original data.
50
+
51
+ Parameters
52
+ ----------
53
+ keys : list, optional
54
+ Keys that the transform looks for when
55
+ calling ``self.transform``, by default []. In general this is
56
+ set automatically, and you won't need to manipulate this argument.
57
+ name : str, optional
58
+ Name of this transform, used to identify it in the dictionary
59
+ produced by ``self.instantiate``, by default None
60
+ prob : float, optional
61
+ Probability of applying this transform, by default 1.0
62
+
63
+ Examples
64
+ --------
65
+
66
+ >>> seed = 0
67
+ >>>
68
+ >>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
69
+ >>> signal = AudioSignal(audio_path, offset=10, duration=2)
70
+ >>> transform = tfm.Compose(
71
+ >>> [
72
+ >>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
73
+ >>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
74
+ >>> ],
75
+ >>> )
76
+ >>>
77
+ >>> kwargs = transform.instantiate(seed, signal)
78
+ >>> output = transform(signal, **kwargs)
79
+
80
+ """
81
+
82
+ def __init__(self, keys: list = [], name: str = None, prob: float = 1.0):
83
+ # Get keys from the _transform signature.
84
+ tfm_keys = list(signature(self._transform).parameters.keys())
85
+
86
+ # Filter out signal and kwargs keys.
87
+ ignore_keys = ["signal", "kwargs"]
88
+ tfm_keys = [k for k in tfm_keys if k not in ignore_keys]
89
+
90
+ # Combine keys specified by the child class, the keys found in
91
+ # _transform signature, and the mask key.
92
+ self.keys = keys + tfm_keys + ["mask"]
93
+
94
+ self.prob = prob
95
+
96
+ if name is None:
97
+ name = self.__class__.__name__
98
+ self.name = name
99
+
100
+ def _prepare(self, batch: dict):
101
+ sub_batch = batch[self.name]
102
+
103
+ for k in self.keys:
104
+ assert k in sub_batch.keys(), f"{k} not in batch"
105
+
106
+ return sub_batch
107
+
108
+ def _transform(self, signal):
109
+ return signal
110
+
111
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
112
+ return {}
113
+
114
+ @staticmethod
115
+ def apply_mask(batch: dict, mask: torch.Tensor):
116
+ """Applies a mask to the batch.
117
+
118
+ Parameters
119
+ ----------
120
+ batch : dict
121
+ Batch whose values will be masked in the ``transform`` pass.
122
+ mask : torch.Tensor
123
+ Mask to apply to batch.
124
+
125
+ Returns
126
+ -------
127
+ dict
128
+ A dictionary that contains values only where ``mask = True``.
129
+ """
130
+ masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
131
+ return unflatten(masked_batch)
132
+
133
+ def transform(self, signal: AudioSignal, **kwargs):
134
+ """Apply the transform to the audio signal,
135
+ with given keyword arguments.
136
+
137
+ Parameters
138
+ ----------
139
+ signal : AudioSignal
140
+ Signal that will be modified by the transforms in-place.
141
+ kwargs: dict
142
+ Keyword arguments to the specific transforms ``self._transform``
143
+ function.
144
+
145
+ Returns
146
+ -------
147
+ AudioSignal
148
+ Transformed AudioSignal.
149
+
150
+ Examples
151
+ --------
152
+
153
+ >>> for seed in range(10):
154
+ >>> kwargs = transform.instantiate(seed, signal)
155
+ >>> output = transform(signal.clone(), **kwargs)
156
+
157
+ """
158
+ tfm_kwargs = self._prepare(kwargs)
159
+ mask = tfm_kwargs["mask"]
160
+
161
+ if torch.any(mask):
162
+ tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
163
+ tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
164
+ signal[mask] = self._transform(signal[mask], **tfm_kwargs)
165
+
166
+ return signal
167
+
168
+ def __call__(self, *args, **kwargs):
169
+ return self.transform(*args, **kwargs)
170
+
171
+ def instantiate(
172
+ self,
173
+ state: RandomState = None,
174
+ signal: AudioSignal = None,
175
+ ):
176
+ """Instantiates parameters for the transform.
177
+
178
+ Parameters
179
+ ----------
180
+ state : RandomState, optional
181
+ _description_, by default None
182
+ signal : AudioSignal, optional
183
+ _description_, by default None
184
+
185
+ Returns
186
+ -------
187
+ dict
188
+ Dictionary containing instantiated arguments for every keyword
189
+ argument to ``self._transform``.
190
+
191
+ Examples
192
+ --------
193
+
194
+ >>> for seed in range(10):
195
+ >>> kwargs = transform.instantiate(seed, signal)
196
+ >>> output = transform(signal.clone(), **kwargs)
197
+
198
+ """
199
+ state = util.random_state(state)
200
+
201
+ # Not all instantiates need the signal. Check if signal
202
+ # is needed before passing it in, so that the end-user
203
+ # doesn't need to have variables they're not using flowing
204
+ # into their function.
205
+ needs_signal = "signal" in set(signature(self._instantiate).parameters.keys())
206
+ kwargs = {}
207
+ if needs_signal:
208
+ kwargs = {"signal": signal}
209
+
210
+ # Instantiate the parameters for the transform.
211
+ params = self._instantiate(state, **kwargs)
212
+ for k in list(params.keys()):
213
+ v = params[k]
214
+ if isinstance(v, (AudioSignal, torch.Tensor, dict)):
215
+ params[k] = v
216
+ else:
217
+ params[k] = tt(v)
218
+ mask = state.rand() <= self.prob
219
+ params[f"mask"] = tt(mask)
220
+
221
+ # Put the params into a nested dictionary that will be
222
+ # used later when calling the transform. This is to avoid
223
+ # collisions in the dictionary.
224
+ params = {self.name: params}
225
+
226
+ return params
227
+
228
+ def batch_instantiate(
229
+ self,
230
+ states: list = None,
231
+ signal: AudioSignal = None,
232
+ ):
233
+ """Instantiates arguments for every item in a batch,
234
+ given a list of states. Each state in the list
235
+ corresponds to one item in the batch.
236
+
237
+ Parameters
238
+ ----------
239
+ states : list, optional
240
+ List of states, by default None
241
+ signal : AudioSignal, optional
242
+ AudioSignal to pass to the ``self.instantiate`` section
243
+ if it is needed for this transform, by default None
244
+
245
+ Returns
246
+ -------
247
+ dict
248
+ Collated dictionary of arguments.
249
+
250
+ Examples
251
+ --------
252
+
253
+ >>> batch_size = 4
254
+ >>> signal = AudioSignal(audio_path, offset=10, duration=2)
255
+ >>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
256
+ >>>
257
+ >>> states = [seed + idx for idx in list(range(batch_size))]
258
+ >>> kwargs = transform.batch_instantiate(states, signal_batch)
259
+ >>> batch_output = transform(signal_batch, **kwargs)
260
+ """
261
+ kwargs = []
262
+ for state in states:
263
+ kwargs.append(self.instantiate(state, signal))
264
+ kwargs = util.collate(kwargs)
265
+ return kwargs
266
+
267
+
268
+ class Identity(BaseTransform):
269
+ """This transform just returns the original signal."""
270
+
271
+ pass
272
+
273
+
274
+ class SpectralTransform(BaseTransform):
275
+ """Spectral transforms require STFT data to exist, since manipulations
276
+ of the STFT require the spectrogram. This just calls ``stft`` before
277
+ the transform is called, and calls ``istft`` after the transform is
278
+ called so that the audio data is written to after the spectral
279
+ manipulation.
280
+ """
281
+
282
+ def transform(self, signal, **kwargs):
283
+ signal.stft()
284
+ super().transform(signal, **kwargs)
285
+ signal.istft()
286
+ return signal
287
+
288
+
289
+ class Compose(BaseTransform):
290
+ """Compose applies transforms in sequence, one after the other. The
291
+ transforms are passed in as positional arguments or as a list like so:
292
+
293
+ >>> transform = tfm.Compose(
294
+ >>> [
295
+ >>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
296
+ >>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
297
+ >>> ],
298
+ >>> )
299
+
300
+ This will convolve the signal with a room impulse response, and then
301
+ add background noise to the signal. Instantiate instantiates
302
+ all the parameters for every transform in the transform list so the
303
+ interface for using the Compose transform is the same as everything
304
+ else:
305
+
306
+ >>> kwargs = transform.instantiate()
307
+ >>> output = transform(signal.clone(), **kwargs)
308
+
309
+ Under the hood, the transform maps each transform to a unique name
310
+ under the hood of the form ``{position}.{name}``, where ``position``
311
+ is the index of the transform in the list. ``Compose`` can nest
312
+ within other ``Compose`` transforms, like so:
313
+
314
+ >>> preprocess = transforms.Compose(
315
+ >>> tfm.GlobalVolumeNorm(),
316
+ >>> tfm.CrossTalk(),
317
+ >>> name="preprocess",
318
+ >>> )
319
+ >>> augment = transforms.Compose(
320
+ >>> tfm.RoomImpulseResponse(),
321
+ >>> tfm.BackgroundNoise(),
322
+ >>> name="augment",
323
+ >>> )
324
+ >>> postprocess = transforms.Compose(
325
+ >>> tfm.VolumeChange(),
326
+ >>> tfm.RescaleAudio(),
327
+ >>> tfm.ShiftPhase(),
328
+ >>> name="postprocess",
329
+ >>> )
330
+ >>> transform = transforms.Compose(preprocess, augment, postprocess),
331
+
332
+ This defines 3 composed transforms, and then composes them in sequence
333
+ with one another.
334
+
335
+ Parameters
336
+ ----------
337
+ *transforms : list
338
+ List of transforms to apply
339
+ name : str, optional
340
+ Name of this transform, used to identify it in the dictionary
341
+ produced by ``self.instantiate``, by default None
342
+ prob : float, optional
343
+ Probability of applying this transform, by default 1.0
344
+ """
345
+
346
+ def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
347
+ if isinstance(transforms[0], list):
348
+ transforms = transforms[0]
349
+
350
+ for i, tfm in enumerate(transforms):
351
+ tfm.name = f"{i}.{tfm.name}"
352
+
353
+ keys = [tfm.name for tfm in transforms]
354
+ super().__init__(keys=keys, name=name, prob=prob)
355
+
356
+ self.transforms = transforms
357
+ self.transforms_to_apply = keys
358
+
359
+ @contextmanager
360
+ def filter(self, *names: list):
361
+ """This can be used to skip transforms entirely when applying
362
+ the sequence of transforms to a signal. For example, take
363
+ the following transforms with the names ``preprocess, augment, postprocess``.
364
+
365
+ >>> preprocess = transforms.Compose(
366
+ >>> tfm.GlobalVolumeNorm(),
367
+ >>> tfm.CrossTalk(),
368
+ >>> name="preprocess",
369
+ >>> )
370
+ >>> augment = transforms.Compose(
371
+ >>> tfm.RoomImpulseResponse(),
372
+ >>> tfm.BackgroundNoise(),
373
+ >>> name="augment",
374
+ >>> )
375
+ >>> postprocess = transforms.Compose(
376
+ >>> tfm.VolumeChange(),
377
+ >>> tfm.RescaleAudio(),
378
+ >>> tfm.ShiftPhase(),
379
+ >>> name="postprocess",
380
+ >>> )
381
+ >>> transform = transforms.Compose(preprocess, augment, postprocess)
382
+
383
+ If we wanted to apply all 3 to a signal, we do:
384
+
385
+ >>> kwargs = transform.instantiate()
386
+ >>> output = transform(signal.clone(), **kwargs)
387
+
388
+ But if we only wanted to apply the ``preprocess`` and ``postprocess``
389
+ transforms to the signal, we do:
390
+
391
+ >>> with transform_fn.filter("preprocess", "postprocess"):
392
+ >>> output = transform(signal.clone(), **kwargs)
393
+
394
+ Parameters
395
+ ----------
396
+ *names : list
397
+ List of transforms, identified by name, to apply to signal.
398
+ """
399
+ old_transforms = self.transforms_to_apply
400
+ self.transforms_to_apply = names
401
+ yield
402
+ self.transforms_to_apply = old_transforms
403
+
404
+ def _transform(self, signal, **kwargs):
405
+ for transform in self.transforms:
406
+ if any([x in transform.name for x in self.transforms_to_apply]):
407
+ signal = transform(signal, **kwargs)
408
+ return signal
409
+
410
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
411
+ parameters = {}
412
+ for transform in self.transforms:
413
+ parameters.update(transform.instantiate(state, signal=signal))
414
+ return parameters
415
+
416
+ def __getitem__(self, idx):
417
+ return self.transforms[idx]
418
+
419
+ def __len__(self):
420
+ return len(self.transforms)
421
+
422
+ def __iter__(self):
423
+ for transform in self.transforms:
424
+ yield transform
425
+
426
+
427
+ class Choose(Compose):
428
+ """Choose logic is the same as :py:func:`audiotools.data.transforms.Compose`,
429
+ but instead of applying all the transforms in sequence, it applies just a single transform,
430
+ which is chosen for each item in the batch.
431
+
432
+ Parameters
433
+ ----------
434
+ *transforms : list
435
+ List of transforms to apply
436
+ weights : list
437
+ Probability of choosing any specific transform.
438
+ name : str, optional
439
+ Name of this transform, used to identify it in the dictionary
440
+ produced by ``self.instantiate``, by default None
441
+ prob : float, optional
442
+ Probability of applying this transform, by default 1.0
443
+
444
+ Examples
445
+ --------
446
+
447
+ >>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
448
+ """
449
+
450
+ def __init__(
451
+ self,
452
+ *transforms: list,
453
+ weights: list = None,
454
+ name: str = None,
455
+ prob: float = 1.0,
456
+ ):
457
+ super().__init__(*transforms, name=name, prob=prob)
458
+
459
+ if weights is None:
460
+ _len = len(self.transforms)
461
+ weights = [1 / _len for _ in range(_len)]
462
+ self.weights = np.array(weights)
463
+
464
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
465
+ kwargs = super()._instantiate(state, signal)
466
+ tfm_idx = list(range(len(self.transforms)))
467
+ tfm_idx = state.choice(tfm_idx, p=self.weights)
468
+ one_hot = []
469
+ for i, t in enumerate(self.transforms):
470
+ mask = kwargs[t.name]["mask"]
471
+ if mask.item():
472
+ kwargs[t.name]["mask"] = tt(i == tfm_idx)
473
+ one_hot.append(kwargs[t.name]["mask"])
474
+ kwargs["one_hot"] = one_hot
475
+ return kwargs
476
+
477
+
478
+ class Repeat(Compose):
479
+ """Repeatedly applies a given transform ``n_repeat`` times."
480
+
481
+ Parameters
482
+ ----------
483
+ transform : BaseTransform
484
+ Transform to repeat.
485
+ n_repeat : int, optional
486
+ Number of times to repeat transform, by default 1
487
+ """
488
+
489
+ def __init__(
490
+ self,
491
+ transform,
492
+ n_repeat: int = 1,
493
+ name: str = None,
494
+ prob: float = 1.0,
495
+ ):
496
+ transforms = [copy.copy(transform) for _ in range(n_repeat)]
497
+ super().__init__(transforms, name=name, prob=prob)
498
+
499
+ self.n_repeat = n_repeat
500
+
501
+
502
+ class RepeatUpTo(Choose):
503
+ """Repeatedly applies a given transform up to ``max_repeat`` times."
504
+
505
+ Parameters
506
+ ----------
507
+ transform : BaseTransform
508
+ Transform to repeat.
509
+ max_repeat : int, optional
510
+ Max number of times to repeat transform, by default 1
511
+ weights : list
512
+ Probability of choosing any specific number up to ``max_repeat``.
513
+ """
514
+
515
+ def __init__(
516
+ self,
517
+ transform,
518
+ max_repeat: int = 5,
519
+ weights: list = None,
520
+ name: str = None,
521
+ prob: float = 1.0,
522
+ ):
523
+ transforms = []
524
+ for n in range(1, max_repeat):
525
+ transforms.append(Repeat(transform, n_repeat=n))
526
+ super().__init__(transforms, name=name, prob=prob, weights=weights)
527
+
528
+ self.max_repeat = max_repeat
529
+
530
+
531
+ class ClippingDistortion(BaseTransform):
532
+ """Adds clipping distortion to signal. Corresponds
533
+ to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.
534
+
535
+ Parameters
536
+ ----------
537
+ perc : tuple, optional
538
+ Clipping percentile. Values are between 0.0 to 1.0.
539
+ Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1)
540
+ name : str, optional
541
+ Name of this transform, used to identify it in the dictionary
542
+ produced by ``self.instantiate``, by default None
543
+ prob : float, optional
544
+ Probability of applying this transform, by default 1.0
545
+ """
546
+
547
+ def __init__(
548
+ self,
549
+ perc: tuple = ("uniform", 0.0, 0.1),
550
+ name: str = None,
551
+ prob: float = 1.0,
552
+ ):
553
+ super().__init__(name=name, prob=prob)
554
+
555
+ self.perc = perc
556
+
557
+ def _instantiate(self, state: RandomState):
558
+ return {"perc": util.sample_from_dist(self.perc, state)}
559
+
560
+ def _transform(self, signal, perc):
561
+ return signal.clip_distortion(perc)
562
+
563
+
564
+ class Equalizer(BaseTransform):
565
+ """Applies an equalization curve to the audio signal. Corresponds
566
+ to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.
567
+
568
+ Parameters
569
+ ----------
570
+ eq_amount : tuple, optional
571
+ The maximum dB cut to apply to the audio in any band,
572
+ by default ("const", 1.0 dB)
573
+ n_bands : int, optional
574
+ Number of bands in EQ, by default 6
575
+ name : str, optional
576
+ Name of this transform, used to identify it in the dictionary
577
+ produced by ``self.instantiate``, by default None
578
+ prob : float, optional
579
+ Probability of applying this transform, by default 1.0
580
+ """
581
+
582
+ def __init__(
583
+ self,
584
+ eq_amount: tuple = ("const", 1.0),
585
+ n_bands: int = 6,
586
+ name: str = None,
587
+ prob: float = 1.0,
588
+ ):
589
+ super().__init__(name=name, prob=prob)
590
+
591
+ self.eq_amount = eq_amount
592
+ self.n_bands = n_bands
593
+
594
+ def _instantiate(self, state: RandomState):
595
+ eq_amount = util.sample_from_dist(self.eq_amount, state)
596
+ eq = -eq_amount * state.rand(self.n_bands)
597
+ return {"eq": eq}
598
+
599
+ def _transform(self, signal, eq):
600
+ return signal.equalizer(eq)
601
+
602
+
603
+ class Quantization(BaseTransform):
604
+ """Applies quantization to the input waveform. Corresponds
605
+ to :py:func:`audiotools.core.effects.EffectMixin.quantization`.
606
+
607
+ Parameters
608
+ ----------
609
+ channels : tuple, optional
610
+ Number of evenly spaced quantization channels to quantize
611
+ to, by default ("choice", [8, 32, 128, 256, 1024])
612
+ name : str, optional
613
+ Name of this transform, used to identify it in the dictionary
614
+ produced by ``self.instantiate``, by default None
615
+ prob : float, optional
616
+ Probability of applying this transform, by default 1.0
617
+ """
618
+
619
+ def __init__(
620
+ self,
621
+ channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
622
+ name: str = None,
623
+ prob: float = 1.0,
624
+ ):
625
+ super().__init__(name=name, prob=prob)
626
+
627
+ self.channels = channels
628
+
629
+ def _instantiate(self, state: RandomState):
630
+ return {"channels": util.sample_from_dist(self.channels, state)}
631
+
632
+ def _transform(self, signal, channels):
633
+ return signal.quantization(channels)
634
+
635
+
636
+ class MuLawQuantization(BaseTransform):
637
+ """Applies mu-law quantization to the input waveform. Corresponds
638
+ to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.
639
+
640
+ Parameters
641
+ ----------
642
+ channels : tuple, optional
643
+ Number of mu-law spaced quantization channels to quantize
644
+ to, by default ("choice", [8, 32, 128, 256, 1024])
645
+ name : str, optional
646
+ Name of this transform, used to identify it in the dictionary
647
+ produced by ``self.instantiate``, by default None
648
+ prob : float, optional
649
+ Probability of applying this transform, by default 1.0
650
+ """
651
+
652
+ def __init__(
653
+ self,
654
+ channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
655
+ name: str = None,
656
+ prob: float = 1.0,
657
+ ):
658
+ super().__init__(name=name, prob=prob)
659
+
660
+ self.channels = channels
661
+
662
+ def _instantiate(self, state: RandomState):
663
+ return {"channels": util.sample_from_dist(self.channels, state)}
664
+
665
+ def _transform(self, signal, channels):
666
+ return signal.mulaw_quantization(channels)
667
+
668
+
669
+ class NoiseFloor(BaseTransform):
670
+ """Adds a noise floor of Gaussian noise to the signal at a specified
671
+ dB.
672
+
673
+ Parameters
674
+ ----------
675
+ db : tuple, optional
676
+ Level of noise to add to signal, by default ("const", -50.0)
677
+ name : str, optional
678
+ Name of this transform, used to identify it in the dictionary
679
+ produced by ``self.instantiate``, by default None
680
+ prob : float, optional
681
+ Probability of applying this transform, by default 1.0
682
+ """
683
+
684
+ def __init__(
685
+ self,
686
+ db: tuple = ("const", -50.0),
687
+ name: str = None,
688
+ prob: float = 1.0,
689
+ ):
690
+ super().__init__(name=name, prob=prob)
691
+
692
+ self.db = db
693
+
694
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
695
+ db = util.sample_from_dist(self.db, state)
696
+ audio_data = state.randn(signal.num_channels, signal.signal_length)
697
+ nz_signal = AudioSignal(audio_data, signal.sample_rate)
698
+ nz_signal.normalize(db)
699
+ return {"nz_signal": nz_signal}
700
+
701
+ def _transform(self, signal, nz_signal):
702
+ # Clone bg_signal so that transform can be repeatedly applied
703
+ # to different signals with the same effect.
704
+ return signal + nz_signal
705
+
706
+
707
+ class BackgroundNoise(BaseTransform):
708
+ """Adds background noise from audio specified by a set of CSV files.
709
+ A valid CSV file looks like, and is typically generated by
710
+ :py:func:`audiotools.data.preprocess.create_csv`:
711
+
712
+ .. csv-table::
713
+ :header: path
714
+
715
+ room_tone/m6_script2_clean.wav
716
+ room_tone/m6_script2_cleanraw.wav
717
+ room_tone/m6_script2_ipad_balcony1.wav
718
+ room_tone/m6_script2_ipad_bedroom1.wav
719
+ room_tone/m6_script2_ipad_confroom1.wav
720
+ room_tone/m6_script2_ipad_confroom2.wav
721
+ room_tone/m6_script2_ipad_livingroom1.wav
722
+ room_tone/m6_script2_ipad_office1.wav
723
+
724
+ .. note::
725
+ All paths are relative to an environment variable called ``PATH_TO_DATA``,
726
+ so that CSV files are portable across machines where data may be
727
+ located in different places.
728
+
729
+ This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
730
+ and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
731
+ hood.
732
+
733
+ Parameters
734
+ ----------
735
+ snr : tuple, optional
736
+ Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
737
+ sources : List[str], optional
738
+ Sources containing folders, or CSVs with paths to audio files,
739
+ by default None
740
+ weights : List[float], optional
741
+ Weights to sample audio files from each source, by default None
742
+ eq_amount : tuple, optional
743
+ Amount of equalization to apply, by default ("const", 1.0)
744
+ n_bands : int, optional
745
+ Number of bands in equalizer, by default 3
746
+ name : str, optional
747
+ Name of this transform, used to identify it in the dictionary
748
+ produced by ``self.instantiate``, by default None
749
+ prob : float, optional
750
+ Probability of applying this transform, by default 1.0
751
+ loudness_cutoff : float, optional
752
+ Loudness cutoff when loading from audio files, by default None
753
+ """
754
+
755
+ def __init__(
756
+ self,
757
+ snr: tuple = ("uniform", 10.0, 30.0),
758
+ sources: List[str] = None,
759
+ weights: List[float] = None,
760
+ eq_amount: tuple = ("const", 1.0),
761
+ n_bands: int = 3,
762
+ name: str = None,
763
+ prob: float = 1.0,
764
+ loudness_cutoff: float = None,
765
+ ):
766
+ super().__init__(name=name, prob=prob)
767
+
768
+ self.snr = snr
769
+ self.eq_amount = eq_amount
770
+ self.n_bands = n_bands
771
+ self.loader = AudioLoader(sources, weights)
772
+ self.loudness_cutoff = loudness_cutoff
773
+
774
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
775
+ eq_amount = util.sample_from_dist(self.eq_amount, state)
776
+ eq = -eq_amount * state.rand(self.n_bands)
777
+ snr = util.sample_from_dist(self.snr, state)
778
+
779
+ bg_signal = self.loader(
780
+ state,
781
+ signal.sample_rate,
782
+ duration=signal.signal_duration,
783
+ loudness_cutoff=self.loudness_cutoff,
784
+ num_channels=signal.num_channels,
785
+ )["signal"]
786
+
787
+ return {"eq": eq, "bg_signal": bg_signal, "snr": snr}
788
+
789
+ def _transform(self, signal, bg_signal, snr, eq):
790
+ # Clone bg_signal so that transform can be repeatedly applied
791
+ # to different signals with the same effect.
792
+ return signal.mix(bg_signal.clone(), snr, eq)
793
+
794
+
795
+ class CrossTalk(BaseTransform):
796
+ """Adds crosstalk between speakers, whose audio is drawn from a CSV file
797
+ that was produced via :py:func:`audiotools.data.preprocess.create_csv`.
798
+
799
+ This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
800
+ under the hood.
801
+
802
+ Parameters
803
+ ----------
804
+ snr : tuple, optional
805
+ How loud cross-talk speaker is relative to original signal in dB,
806
+ by default ("uniform", 0.0, 10.0)
807
+ sources : List[str], optional
808
+ Sources containing folders, or CSVs with paths to audio files,
809
+ by default None
810
+ weights : List[float], optional
811
+ Weights to sample audio files from each source, by default None
812
+ name : str, optional
813
+ Name of this transform, used to identify it in the dictionary
814
+ produced by ``self.instantiate``, by default None
815
+ prob : float, optional
816
+ Probability of applying this transform, by default 1.0
817
+ loudness_cutoff : float, optional
818
+ Loudness cutoff when loading from audio files, by default -40
819
+ """
820
+
821
+ def __init__(
822
+ self,
823
+ snr: tuple = ("uniform", 0.0, 10.0),
824
+ sources: List[str] = None,
825
+ weights: List[float] = None,
826
+ name: str = None,
827
+ prob: float = 1.0,
828
+ loudness_cutoff: float = -40,
829
+ ):
830
+ super().__init__(name=name, prob=prob)
831
+
832
+ self.snr = snr
833
+ self.loader = AudioLoader(sources, weights)
834
+ self.loudness_cutoff = loudness_cutoff
835
+
836
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
837
+ snr = util.sample_from_dist(self.snr, state)
838
+ crosstalk_signal = self.loader(
839
+ state,
840
+ signal.sample_rate,
841
+ duration=signal.signal_duration,
842
+ loudness_cutoff=self.loudness_cutoff,
843
+ num_channels=signal.num_channels,
844
+ )["signal"]
845
+
846
+ return {"crosstalk_signal": crosstalk_signal, "snr": snr}
847
+
848
+ def _transform(self, signal, crosstalk_signal, snr):
849
+ # Clone bg_signal so that transform can be repeatedly applied
850
+ # to different signals with the same effect.
851
+ loudness = signal.loudness()
852
+ mix = signal.mix(crosstalk_signal.clone(), snr)
853
+ mix.normalize(loudness)
854
+ return mix
855
+
+
+ class RoomImpulseResponse(BaseTransform):
+ """Convolves signal with a room impulse response, at a specified
+ direct-to-reverberant ratio, with equalization applied. Room impulse
+ response data is drawn from a CSV file that was produced via
+ :py:func:`audiotools.data.preprocess.create_csv`.
+
+ This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
+ under the hood.
+
+ Parameters
+ ----------
+ drr : tuple, optional
+ Direct-to-reverberant ratio in dB, by default ("uniform", 0.0, 30.0)
+ sources : List[str], optional
+ Sources containing folders, or CSVs with paths to audio files,
+ by default None
+ weights : List[float], optional
+ Weights to sample audio files from each source, by default None
+ eq_amount : tuple, optional
+ Amount of equalization to apply, by default ("const", 1.0)
+ n_bands : int, optional
+ Number of bands in equalizer, by default 6
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ use_original_phase : bool, optional
+ Whether or not to use the original phase, by default False
+ offset : float, optional
+ Offset from each impulse response file to use, by default 0.0
+ duration : float, optional
+ Duration of each impulse response, by default 1.0
+ """
+
+ def __init__(
+ self,
+ drr: tuple = ("uniform", 0.0, 30.0),
+ sources: List[str] = None,
+ weights: List[float] = None,
+ eq_amount: tuple = ("const", 1.0),
+ n_bands: int = 6,
+ name: str = None,
+ prob: float = 1.0,
+ use_original_phase: bool = False,
+ offset: float = 0.0,
+ duration: float = 1.0,
+ ):
+ super().__init__(name=name, prob=prob)
+
+ self.drr = drr
+ self.eq_amount = eq_amount
+ self.n_bands = n_bands
+ self.use_original_phase = use_original_phase
+
+ self.loader = AudioLoader(sources, weights)
+ self.offset = offset
+ self.duration = duration
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
+ eq_amount = util.sample_from_dist(self.eq_amount, state)
+ eq = -eq_amount * state.rand(self.n_bands)
+ drr = util.sample_from_dist(self.drr, state)
+
+ ir_signal = self.loader(
+ state,
+ signal.sample_rate,
+ offset=self.offset,
+ duration=self.duration,
+ loudness_cutoff=None,
+ num_channels=signal.num_channels,
+ )["signal"]
+ ir_signal.zero_pad_to(signal.sample_rate)
+
+ return {"eq": eq, "ir_signal": ir_signal, "drr": drr}
+
+ def _transform(self, signal, ir_signal, drr, eq):
+ # Clone ir_signal so that transform can be repeatedly applied
+ # to different signals with the same effect.
+ return signal.apply_ir(
+ ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase
+ )
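Note that ``_instantiate`` above loads ``duration`` seconds of each impulse response starting at ``offset`` and zero-pads it to one second's worth of samples before convolution. A ``RoomImpulseResponse`` usage sketch (placeholder paths; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    transform = tfm.RoomImpulseResponse(
        drr=("uniform", 0.0, 30.0),  # direct-to-reverberant ratio in dB
        sources=["irs.csv"],         # placeholder CSV of impulse responses
    )

    kwargs = transform.instantiate(np.random.RandomState(0), signal)
    reverbed = transform(signal.clone(), **kwargs)  # convolves with the drawn IR at the drawn DRR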
+
+
+ class VolumeChange(BaseTransform):
+ """Changes the volume of the input signal.
+
+ Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.
+
+ Parameters
+ ----------
+ db : tuple, optional
+ Change in volume in decibels, by default ("uniform", -12.0, 0.0)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ db: tuple = ("uniform", -12.0, 0.0),
+ name: str = None,
+ prob: float = 1.0,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.db = db
+
+ def _instantiate(self, state: RandomState):
+ return {"db": util.sample_from_dist(self.db, state)}
+
+ def _transform(self, signal, db):
+ return signal.volume_change(db)
+
+
+ class VolumeNorm(BaseTransform):
+ """Normalizes the volume of the excerpt to a specified decibel.
+
+ Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.
+
+ Parameters
+ ----------
+ db : tuple, optional
+ dB to normalize signal to, by default ("const", -24)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ db: tuple = ("const", -24),
+ name: str = None,
+ prob: float = 1.0,
+ ):
+ super().__init__(name=name, prob=prob)
+
+ self.db = db
+
+ def _instantiate(self, state: RandomState):
+ return {"db": util.sample_from_dist(self.db, state)}
+
+ def _transform(self, signal, db):
+ return signal.normalize(db)
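Both volume transforms draw their ``db`` value through ``util.sample_from_dist``, which appears to interpret the tuples used throughout this file as follows: ``("const", x)`` always returns ``x``, ``("choice", [...])`` picks one entry, and other names such as ``"uniform"`` are drawn from the random state. A short sketch contrasting the two (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    state = np.random.RandomState(0)

    # VolumeChange applies a relative gain drawn from the distribution.
    change = tfm.VolumeChange(db=("uniform", -12.0, 0.0))
    quieter = change(signal.clone(), **change.instantiate(state, signal))

    # VolumeNorm normalizes the excerpt itself to the target loudness.
    norm = tfm.VolumeNorm(db=("const", -24))
    normed = norm(signal.clone(), **norm.instantiate(state, signal))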
+
+
+ class GlobalVolumeNorm(BaseTransform):
+ """Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
+ transform also normalizes the volume of a signal, but it uses
+ the volume of the entire audio file the loaded excerpt comes from,
+ rather than the volume of just the excerpt. The volume of the
+ entire audio file is expected in ``signal.metadata["loudness"]``.
+ If loading audio from a CSV generated by :py:func:`audiotools.data.preprocess.create_csv`
+ with ``loudness = True``, like the following:
+
+ .. csv-table::
+ :header: path,loudness
+
+ daps/produced/f1_script1_produced.wav,-16.299999237060547
+ daps/produced/f1_script2_produced.wav,-16.600000381469727
+ daps/produced/f1_script3_produced.wav,-17.299999237060547
+ daps/produced/f1_script4_produced.wav,-16.100000381469727
+ daps/produced/f1_script5_produced.wav,-16.700000762939453
+ daps/produced/f3_script1_produced.wav,-16.5
+
+ The ``AudioLoader`` will automatically load the loudness column into
+ the metadata of the signal.
+
+ Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.
+
+ Parameters
+ ----------
+ db : tuple, optional
+ dB to normalize signal to, by default ("const", -24)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ db: tuple = ("const", -24),
+ name: str = None,
+ prob: float = 1.0,
+ ):
+ super().__init__(name=name, prob=prob)
+
+ self.db = db
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
+ if "loudness" not in signal.metadata:
+ db_change = 0.0
+ elif float(signal.metadata["loudness"]) == float("-inf"):
+ db_change = 0.0
+ else:
+ db = util.sample_from_dist(self.db, state)
+ db_change = db - float(signal.metadata["loudness"])
+
+ return {"db": db_change}
+
+ def _transform(self, signal, db):
+ return signal.volume_change(db)
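``GlobalVolumeNorm`` only changes the gain when ``signal.metadata["loudness"]`` is present and finite; the change is computed against the whole-file loudness rather than the excerpt's own. A sketch that sets the metadata by hand instead of relying on an ``AudioLoader`` CSV (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")    # placeholder path
    signal.metadata["loudness"] = -16.5   # loudness of the full source file, in dB

    transform = tfm.GlobalVolumeNorm(db=("const", -24))
    kwargs = transform.instantiate(np.random.RandomState(0), signal)
    output = transform(signal.clone(), **kwargs)  # applies -24 - (-16.5) = -7.5 dB of gain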
+
+
+ class Silence(BaseTransform):
+ """Zeros out the signal with some probability.
+
+ Parameters
+ ----------
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 0.1
+ """
+
+ def __init__(self, name: str = None, prob: float = 0.1):
+ super().__init__(name=name, prob=prob)
+
+ def _transform(self, signal):
+ _loudness = signal._loudness
+ signal = AudioSignal(
+ torch.zeros_like(signal.audio_data),
+ sample_rate=signal.sample_rate,
+ stft_params=signal.stft_params,
+ )
+ # So that the amount of noise added is as if it wasn't silenced.
+ # TODO: improve this hack
+ signal._loudness = _loudness
+
+ return signal
+
+
+ class LowPass(BaseTransform):
+ """Applies a LowPass filter.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.
+
+ Parameters
+ ----------
+ cutoff : tuple, optional
+ Cutoff frequency distribution,
+ by default ``("choice", [4000, 8000, 16000])``
+ zeros : int, optional
+ Number of zero-crossings in filter, argument to
+ ``julius.LowPassFilters``, by default 51
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ cutoff: tuple = ("choice", [4000, 8000, 16000]),
+ zeros: int = 51,
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+
+ self.cutoff = cutoff
+ self.zeros = zeros
+
+ def _instantiate(self, state: RandomState):
+ return {"cutoff": util.sample_from_dist(self.cutoff, state)}
+
+ def _transform(self, signal, cutoff):
+ return signal.low_pass(cutoff, zeros=self.zeros)
+
+
+ class HighPass(BaseTransform):
+ """Applies a HighPass filter.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.
+
+ Parameters
+ ----------
+ cutoff : tuple, optional
+ Cutoff frequency distribution,
+ by default ``("choice", [50, 100, 250, 500, 1000])``
+ zeros : int, optional
+ Number of zero-crossings in filter, argument to
+ ``julius.LowPassFilters``, by default 51
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
+ zeros: int = 51,
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+
+ self.cutoff = cutoff
+ self.zeros = zeros
+
+ def _instantiate(self, state: RandomState):
+ return {"cutoff": util.sample_from_dist(self.cutoff, state)}
+
+ def _transform(self, signal, cutoff):
+ return signal.high_pass(cutoff, zeros=self.zeros)
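Both filter transforms sample their cutoff from a ``("choice", [...])`` distribution by default, so each instantiation removes content above (for ``LowPass``) or below (for ``HighPass``) a different frequency. A sketch (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    state = np.random.RandomState(0)

    lowpass = tfm.LowPass(cutoff=("choice", [4000, 8000, 16000]))
    muffled = lowpass(signal.clone(), **lowpass.instantiate(state, signal))

    highpass = tfm.HighPass(cutoff=("choice", [50, 100, 250]))
    thinned = highpass(signal.clone(), **highpass.instantiate(state, signal))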
+
+
+ class RescaleAudio(BaseTransform):
+ """Rescales the audio so it is in between ``-val`` and ``val``
+ only if the original audio exceeds those bounds. Useful if
+ transforms have caused the audio to clip.
+
+ Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.
+
+ Parameters
+ ----------
+ val : float, optional
+ Max absolute value of signal, by default 1.0
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(self, val: float = 1.0, name: str = None, prob: float = 1):
+ super().__init__(name=name, prob=prob)
+
+ self.val = val
+
+ def _transform(self, signal):
+ return signal.ensure_max_of_audio(self.val)
+
+
+ class ShiftPhase(SpectralTransform):
+ """Shifts the phase of the audio.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.
+
+ Parameters
+ ----------
+ shift : tuple, optional
+ How much to shift phase by, by default ("uniform", -np.pi, np.pi)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ shift: tuple = ("uniform", -np.pi, np.pi),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.shift = shift
+
+ def _instantiate(self, state: RandomState):
+ return {"shift": util.sample_from_dist(self.shift, state)}
+
+ def _transform(self, signal, shift):
+ return signal.shift_phase(shift)
+
+
+ class InvertPhase(ShiftPhase):
+ """Inverts the phase of the audio.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.
+
+ Parameters
+ ----------
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(self, name: str = None, prob: float = 1):
+ super().__init__(shift=("const", np.pi), name=name, prob=prob)
+
+
+ class CorruptPhase(SpectralTransform):
+ """Corrupts the phase of the audio.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`.
+
+ Parameters
+ ----------
+ scale : tuple, optional
+ How much to corrupt phase by, by default ("uniform", 0, np.pi)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
+ ):
+ super().__init__(name=name, prob=prob)
+ self.scale = scale
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
+ scale = util.sample_from_dist(self.scale, state)
+ corruption = state.normal(scale=scale, size=signal.phase.shape[1:])
+ return {"corruption": corruption.astype("float32")}
+
+ def _transform(self, signal, corruption):
+ return signal.shift_phase(shift=corruption)
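The phase transforms are ``SpectralTransform`` subclasses, which (judging from the base class defined earlier in this file) handle the STFT/ISTFT round trip around ``_transform`` so that ``signal.phase`` can be edited directly. A sketch of the two simplest cases (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    state = np.random.RandomState(0)

    invert = tfm.InvertPhase()  # ShiftPhase with a constant shift of pi
    flipped = invert(signal.clone(), **invert.instantiate(state, signal))

    corrupt = tfm.CorruptPhase(scale=("uniform", 0, np.pi))
    corrupted = corrupt(signal.clone(), **corrupt.instantiate(state, signal))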
+
+
+ class FrequencyMask(SpectralTransform):
+ """Masks a band of frequencies at a center frequency
+ from the audio.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.
+
+ Parameters
+ ----------
+ f_center : tuple, optional
+ Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
+ f_width : tuple, optional
+ Width of zero'd out band, by default ("const", 0.1)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ f_center: tuple = ("uniform", 0.0, 1.0),
+ f_width: tuple = ("const", 0.1),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.f_center = f_center
+ self.f_width = f_width
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
+ f_center = util.sample_from_dist(self.f_center, state)
+ f_width = util.sample_from_dist(self.f_width, state)
+
+ fmin = max(f_center - (f_width / 2), 0.0)
+ fmax = min(f_center + (f_width / 2), 1.0)
+
+ fmin_hz = (signal.sample_rate / 2) * fmin
+ fmax_hz = (signal.sample_rate / 2) * fmax
+
+ return {"fmin_hz": fmin_hz, "fmax_hz": fmax_hz}
+
+ def _transform(self, signal, fmin_hz: float, fmax_hz: float):
+ return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
+
+
+ class TimeMask(SpectralTransform):
+ """Masks out contiguous time-steps from signal.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.
+
+ Parameters
+ ----------
+ t_center : tuple, optional
+ Center time in terms of 0.0 and 1.0 (duration of signal),
+ by default ("uniform", 0.0, 1.0)
+ t_width : tuple, optional
+ Width of dropped out portion, by default ("const", 0.025)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ t_center: tuple = ("uniform", 0.0, 1.0),
+ t_width: tuple = ("const", 0.025),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.t_center = t_center
+ self.t_width = t_width
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal):
+ t_center = util.sample_from_dist(self.t_center, state)
+ t_width = util.sample_from_dist(self.t_width, state)
+
+ tmin = max(t_center - (t_width / 2), 0.0)
+ tmax = min(t_center + (t_width / 2), 1.0)
+
+ tmin_s = signal.signal_duration * tmin
+ tmax_s = signal.signal_duration * tmax
+ return {"tmin_s": tmin_s, "tmax_s": tmax_s}
+
+ def _transform(self, signal, tmin_s: float, tmax_s: float):
+ return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)
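Both mask transforms sample a relative center and width and convert them to absolute units in ``_instantiate``. As a worked example of that arithmetic: at a 44100 Hz sample rate, ``f_center = 0.5`` and ``f_width = 0.1`` mask the band from 0.45 * 22050 = 9922.5 Hz to 0.55 * 22050 = 12127.5 Hz; for a 10 s signal, ``t_center = 0.5`` and ``t_width = 0.025`` mask 4.875 s to 5.125 s. A sketch (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    state = np.random.RandomState(0)

    fmask = tfm.FrequencyMask(f_center=("const", 0.5), f_width=("const", 0.1))
    masked = fmask(signal.clone(), **fmask.instantiate(state, signal))

    tmask = tfm.TimeMask(t_center=("const", 0.5), t_width=("const", 0.025))
    masked = tmask(masked, **tmask.instantiate(state, masked))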
+
+
+ class MaskLowMagnitudes(SpectralTransform):
+ """Masks low magnitude regions out of signal.
+
+ Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.
+
+ Parameters
+ ----------
+ db_cutoff : tuple, optional
+ Decibel threshold below which spectrogram bins are masked away,
+ by default ("uniform", -10, 10)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ db_cutoff: tuple = ("uniform", -10, 10),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.db_cutoff = db_cutoff
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
+ return {"db_cutoff": util.sample_from_dist(self.db_cutoff, state)}
+
+ def _transform(self, signal, db_cutoff: float):
+ return signal.mask_low_magnitudes(db_cutoff)
+
+
+ class Smoothing(BaseTransform):
+ """Convolves the signal with a smoothing window.
+
+ Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.
+
+ Parameters
+ ----------
+ window_type : tuple, optional
+ Type of window to use, by default ("const", "average")
+ window_length : tuple, optional
+ Length of smoothing window, by
+ default ("choice", [8, 16, 32, 64, 128, 256, 512])
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ window_type: tuple = ("const", "average"),
+ window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(name=name, prob=prob)
+ self.window_type = window_type
+ self.window_length = window_length
+
+ def _instantiate(self, state: RandomState, signal: AudioSignal = None):
+ window_type = util.sample_from_dist(self.window_type, state)
+ window_length = util.sample_from_dist(self.window_length, state)
+ window = signal.get_window(
+ window_type=window_type, window_length=window_length, device="cpu"
+ )
+ return {"window": AudioSignal(window, signal.sample_rate)}
+
+ def _transform(self, signal, window):
+ sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
+ sscale[sscale == 0.0] = 1.0
+
+ out = signal.convolve(window)
+
+ oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
+ oscale[oscale == 0.0] = 1.0
+
+ out = out * (sscale / oscale)
+ return out
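``Smoothing._transform`` rescales the convolved output so its per-channel peak matches the input's peak; without that step, convolving with an averaging window would also attenuate the signal. A usage sketch (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("speech.wav")  # placeholder path
    transform = tfm.Smoothing(
        window_type=("const", "average"),
        window_length=("choice", [64, 128, 256]),
    )

    kwargs = transform.instantiate(np.random.RandomState(0), signal)
    smoothed = transform(signal.clone(), **kwargs)  # peak level matches the input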
+
+
+ class TimeNoise(TimeMask):
+ """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
+ replaces with noise instead of zeros.
+
+ Parameters
+ ----------
+ t_center : tuple, optional
+ Center time in terms of 0.0 and 1.0 (duration of signal),
+ by default ("uniform", 0.0, 1.0)
+ t_width : tuple, optional
+ Width of portion replaced with noise, by default ("const", 0.025)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ t_center: tuple = ("uniform", 0.0, 1.0),
+ t_width: tuple = ("const", 0.025),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob)
+
+ def _transform(self, signal, tmin_s: float, tmax_s: float):
+ signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)
+ mag, phase = signal.magnitude, signal.phase
+
+ mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
+ mask = (mag == 0.0) * (phase == 0.0)
+
+ mag[mask] = mag_r[mask]
+ phase[mask] = phase_r[mask]
+
+ signal.magnitude = mag
+ signal.phase = phase
+ return signal
+
+
+ class FrequencyNoise(FrequencyMask):
+ """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
+ replaces with noise instead of zeros.
+
+ Parameters
+ ----------
+ f_center : tuple, optional
+ Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
+ f_width : tuple, optional
+ Width of band replaced with noise, by default ("const", 0.1)
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ f_center: tuple = ("uniform", 0.0, 1.0),
+ f_width: tuple = ("const", 0.1),
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob)
+
+ def _transform(self, signal, fmin_hz: float, fmax_hz: float):
+ signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
+ mag, phase = signal.magnitude, signal.phase
+
+ mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
+ mask = (mag == 0.0) * (phase == 0.0)
+
+ mag[mask] = mag_r[mask]
+ phase[mask] = phase_r[mask]
+
+ signal.magnitude = mag
+ signal.phase = phase
+ return signal
+
+
+ class SpectralDenoising(Equalizer):
+ """Applies denoising algorithm detailed in
+ :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
+ using a randomly generated noise signal for denoising.
+
+ Parameters
+ ----------
+ eq_amount : tuple, optional
+ Amount of eq to apply to noise signal, by default ("const", 1.0)
+ denoise_amount : tuple, optional
+ Amount to denoise by, by default ("uniform", 0.8, 1.0)
+ nz_volume : float, optional
+ Volume of noise to denoise with, by default -40
+ n_bands : int, optional
+ Number of bands in equalizer, by default 6
+ n_freq : int, optional
+ Number of frequency bins to smooth by, by default 3
+ n_time : int, optional
+ Number of time bins to smooth by, by default 5
+ name : str, optional
+ Name of this transform, used to identify it in the dictionary
+ produced by ``self.instantiate``, by default None
+ prob : float, optional
+ Probability of applying this transform, by default 1.0
+ """
+
+ def __init__(
+ self,
+ eq_amount: tuple = ("const", 1.0),
+ denoise_amount: tuple = ("uniform", 0.8, 1.0),
+ nz_volume: float = -40,
+ n_bands: int = 6,
+ n_freq: int = 3,
+ n_time: int = 5,
+ name: str = None,
+ prob: float = 1,
+ ):
+ super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob)
+
+ self.nz_volume = nz_volume
+ self.denoise_amount = denoise_amount
+ self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)
+
+ def _transform(self, signal, nz, eq, denoise_amount):
+ nz = nz.normalize(self.nz_volume).equalizer(eq)
+ self.spectral_gate = self.spectral_gate.to(signal.device)
+ signal = self.spectral_gate(signal, nz, denoise_amount)
+ return signal
+
+ def _instantiate(self, state: RandomState):
+ kwargs = super()._instantiate(state)
+ kwargs["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
+ kwargs["nz"] = AudioSignal(state.randn(22050), 44100)
+ return kwargs
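``SpectralDenoising`` reuses the ``Equalizer`` instantiation for the EQ curve and adds half a second of random noise (``state.randn(22050)`` samples at 44100 Hz), normalized to ``nz_volume``, as the noise reference for the spectral gate. A closing usage sketch (placeholder path; public wrappers and top-level ``AudioSignal`` export assumed; ``instantiate`` is called without a signal here since none of the sampled parameters depend on it):

    import numpy as np
    from xinference.thirdparty.audiotools import AudioSignal
    from xinference.thirdparty.audiotools.data import transforms as tfm

    signal = AudioSignal("noisy_speech.wav")  # placeholder path
    transform = tfm.SpectralDenoising(denoise_amount=("uniform", 0.8, 1.0))

    kwargs = transform.instantiate(np.random.RandomState(0))
    denoised = transform(signal.clone(), **kwargs)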