xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +473 -31
- xinference/client/restful/async_restful_client.py +178 -8
- xinference/client/restful/restful_client.py +151 -3
- xinference/core/supervisor.py +99 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +58 -21
- xinference/model/image/model_spec.json +159 -90
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1299 -174
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +44 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +48 -32
- xinference/model/llm/vllm/core.py +207 -72
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
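The largest functional additions in this list are the IndexTTS-2 audio family (xinference/model/audio/indextts2.py plus the vendored xinference/thirdparty/indextts and xinference/thirdparty/audiotools trees) and the expanded REST API and clients. Below is a minimal sketch, not taken from this release's docs, of driving a newly added audio model through the synchronous client; it assumes a local server on the default port, and the model name "IndexTTS-2" is an assumption to be checked against the updated xinference/model/audio/model_spec.json.

# Hedged sketch: exercises the updated restful_client.py against a locally
# running xinference server.
# Assumption: the new audio family is registered under the name "IndexTTS-2".
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(model_name="IndexTTS-2", model_type="audio")
model = client.get_model(model_uid)

# speech() returns the encoded audio as bytes (mp3 by default).
audio = model.speech("Hello from xinference 1.11.0")
with open("out.mp3", "wb") as f:
    f.write(audio)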
|
@@ -0,0 +1,1592 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from inspect import signature
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import torch
|
|
8
|
+
from flatten_dict import flatten
|
|
9
|
+
from flatten_dict import unflatten
|
|
10
|
+
from numpy.random import RandomState
|
|
11
|
+
|
|
12
|
+
from .. import ml
|
|
13
|
+
from ..core import AudioSignal
|
|
14
|
+
from ..core import util
|
|
15
|
+
from .datasets import AudioLoader
|
|
16
|
+
|
|
17
|
+
tt = torch.tensor
|
|
18
|
+
"""Shorthand for converting things to torch.tensor."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BaseTransform:
|
|
22
|
+
"""This is the base class for all transforms that are implemented
|
|
23
|
+
in this library. Transforms have two main operations: ``transform``
|
|
24
|
+
and ``instantiate``.
|
|
25
|
+
|
|
26
|
+
``instantiate`` sets the parameters randomly
|
|
27
|
+
from distribution tuples for each parameter. For example, for the
|
|
28
|
+
``BackgroundNoise`` transform, the signal-to-noise ratio (``snr``)
|
|
29
|
+
is chosen randomly by instantiate. By default, it chosen uniformly
|
|
30
|
+
between 10.0 and 30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``).
|
|
31
|
+
|
|
32
|
+
``transform`` applies the transform using the instantiated parameters.
|
|
33
|
+
A simple example is as follows:
|
|
34
|
+
|
|
35
|
+
>>> seed = 0
|
|
36
|
+
>>> signal = ...
|
|
37
|
+
>>> transform = transforms.NoiseFloor(db = ("uniform", -50.0, -30.0))
|
|
38
|
+
>>> kwargs = transform.instantiate()
|
|
39
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
40
|
+
|
|
41
|
+
By breaking apart the instantiation of parameters from the actual audio
|
|
42
|
+
processing of the transform, we can make things more reproducible, while
|
|
43
|
+
also applying the transform on batches of data efficiently on GPU,
|
|
44
|
+
rather than on individual audio samples.
|
|
45
|
+
|
|
46
|
+
.. note::
|
|
47
|
+
We call ``signal.clone()`` for the input to the ``transform`` function
|
|
48
|
+
because signals are modified in-place! If you don't clone the signal,
|
|
49
|
+
you will lose the original data.
|
|
50
|
+
|
|
51
|
+
Parameters
|
|
52
|
+
----------
|
|
53
|
+
keys : list, optional
|
|
54
|
+
Keys that the transform looks for when
|
|
55
|
+
calling ``self.transform``, by default []. In general this is
|
|
56
|
+
set automatically, and you won't need to manipulate this argument.
|
|
57
|
+
name : str, optional
|
|
58
|
+
Name of this transform, used to identify it in the dictionary
|
|
59
|
+
produced by ``self.instantiate``, by default None
|
|
60
|
+
prob : float, optional
|
|
61
|
+
Probability of applying this transform, by default 1.0
|
|
62
|
+
|
|
63
|
+
Examples
|
|
64
|
+
--------
|
|
65
|
+
|
|
66
|
+
>>> seed = 0
|
|
67
|
+
>>>
|
|
68
|
+
>>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
|
|
69
|
+
>>> signal = AudioSignal(audio_path, offset=10, duration=2)
|
|
70
|
+
>>> transform = tfm.Compose(
|
|
71
|
+
>>> [
|
|
72
|
+
>>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
|
|
73
|
+
>>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
|
|
74
|
+
>>> ],
|
|
75
|
+
>>> )
|
|
76
|
+
>>>
|
|
77
|
+
>>> kwargs = transform.instantiate(seed, signal)
|
|
78
|
+
>>> output = transform(signal, **kwargs)
|
|
79
|
+
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
def __init__(self, keys: list = [], name: str = None, prob: float = 1.0):
|
|
83
|
+
# Get keys from the _transform signature.
|
|
84
|
+
tfm_keys = list(signature(self._transform).parameters.keys())
|
|
85
|
+
|
|
86
|
+
# Filter out signal and kwargs keys.
|
|
87
|
+
ignore_keys = ["signal", "kwargs"]
|
|
88
|
+
tfm_keys = [k for k in tfm_keys if k not in ignore_keys]
|
|
89
|
+
|
|
90
|
+
# Combine keys specified by the child class, the keys found in
|
|
91
|
+
# _transform signature, and the mask key.
|
|
92
|
+
self.keys = keys + tfm_keys + ["mask"]
|
|
93
|
+
|
|
94
|
+
self.prob = prob
|
|
95
|
+
|
|
96
|
+
if name is None:
|
|
97
|
+
name = self.__class__.__name__
|
|
98
|
+
self.name = name
|
|
99
|
+
|
|
100
|
+
def _prepare(self, batch: dict):
|
|
101
|
+
sub_batch = batch[self.name]
|
|
102
|
+
|
|
103
|
+
for k in self.keys:
|
|
104
|
+
assert k in sub_batch.keys(), f"{k} not in batch"
|
|
105
|
+
|
|
106
|
+
return sub_batch
|
|
107
|
+
|
|
108
|
+
def _transform(self, signal):
|
|
109
|
+
return signal
|
|
110
|
+
|
|
111
|
+
def _instantiate(self, state: RandomState, signal: AudioSignal = None):
|
|
112
|
+
return {}
|
|
113
|
+
|
|
114
|
+
@staticmethod
|
|
115
|
+
def apply_mask(batch: dict, mask: torch.Tensor):
|
|
116
|
+
"""Applies a mask to the batch.
|
|
117
|
+
|
|
118
|
+
Parameters
|
|
119
|
+
----------
|
|
120
|
+
batch : dict
|
|
121
|
+
Batch whose values will be masked in the ``transform`` pass.
|
|
122
|
+
mask : torch.Tensor
|
|
123
|
+
Mask to apply to batch.
|
|
124
|
+
|
|
125
|
+
Returns
|
|
126
|
+
-------
|
|
127
|
+
dict
|
|
128
|
+
A dictionary that contains values only where ``mask = True``.
|
|
129
|
+
"""
|
|
130
|
+
masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
|
|
131
|
+
return unflatten(masked_batch)
|
|
132
|
+
|
|
133
|
+
def transform(self, signal: AudioSignal, **kwargs):
|
|
134
|
+
"""Apply the transform to the audio signal,
|
|
135
|
+
with given keyword arguments.
|
|
136
|
+
|
|
137
|
+
Parameters
|
|
138
|
+
----------
|
|
139
|
+
signal : AudioSignal
|
|
140
|
+
Signal that will be modified by the transforms in-place.
|
|
141
|
+
kwargs: dict
|
|
142
|
+
Keyword arguments to the specific transforms ``self._transform``
|
|
143
|
+
function.
|
|
144
|
+
|
|
145
|
+
Returns
|
|
146
|
+
-------
|
|
147
|
+
AudioSignal
|
|
148
|
+
Transformed AudioSignal.
|
|
149
|
+
|
|
150
|
+
Examples
|
|
151
|
+
--------
|
|
152
|
+
|
|
153
|
+
>>> for seed in range(10):
|
|
154
|
+
>>> kwargs = transform.instantiate(seed, signal)
|
|
155
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
156
|
+
|
|
157
|
+
"""
|
|
158
|
+
tfm_kwargs = self._prepare(kwargs)
|
|
159
|
+
mask = tfm_kwargs["mask"]
|
|
160
|
+
|
|
161
|
+
if torch.any(mask):
|
|
162
|
+
tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
|
|
163
|
+
tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
|
|
164
|
+
signal[mask] = self._transform(signal[mask], **tfm_kwargs)
|
|
165
|
+
|
|
166
|
+
return signal
|
|
167
|
+
|
|
168
|
+
def __call__(self, *args, **kwargs):
|
|
169
|
+
return self.transform(*args, **kwargs)
|
|
170
|
+
|
|
171
|
+
def instantiate(
|
|
172
|
+
self,
|
|
173
|
+
state: RandomState = None,
|
|
174
|
+
signal: AudioSignal = None,
|
|
175
|
+
):
|
|
176
|
+
"""Instantiates parameters for the transform.
|
|
177
|
+
|
|
178
|
+
Parameters
|
|
179
|
+
----------
|
|
180
|
+
state : RandomState, optional
|
|
181
|
+
_description_, by default None
|
|
182
|
+
signal : AudioSignal, optional
|
|
183
|
+
_description_, by default None
|
|
184
|
+
|
|
185
|
+
Returns
|
|
186
|
+
-------
|
|
187
|
+
dict
|
|
188
|
+
Dictionary containing instantiated arguments for every keyword
|
|
189
|
+
argument to ``self._transform``.
|
|
190
|
+
|
|
191
|
+
Examples
|
|
192
|
+
--------
|
|
193
|
+
|
|
194
|
+
>>> for seed in range(10):
|
|
195
|
+
>>> kwargs = transform.instantiate(seed, signal)
|
|
196
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
197
|
+
|
|
198
|
+
"""
|
|
199
|
+
state = util.random_state(state)
|
|
200
|
+
|
|
201
|
+
# Not all instantiates need the signal. Check if signal
|
|
202
|
+
# is needed before passing it in, so that the end-user
|
|
203
|
+
# doesn't need to have variables they're not using flowing
|
|
204
|
+
# into their function.
|
|
205
|
+
needs_signal = "signal" in set(signature(self._instantiate).parameters.keys())
|
|
206
|
+
kwargs = {}
|
|
207
|
+
if needs_signal:
|
|
208
|
+
kwargs = {"signal": signal}
|
|
209
|
+
|
|
210
|
+
# Instantiate the parameters for the transform.
|
|
211
|
+
params = self._instantiate(state, **kwargs)
|
|
212
|
+
for k in list(params.keys()):
|
|
213
|
+
v = params[k]
|
|
214
|
+
if isinstance(v, (AudioSignal, torch.Tensor, dict)):
|
|
215
|
+
params[k] = v
|
|
216
|
+
else:
|
|
217
|
+
params[k] = tt(v)
|
|
218
|
+
mask = state.rand() <= self.prob
|
|
219
|
+
params[f"mask"] = tt(mask)
|
|
220
|
+
|
|
221
|
+
# Put the params into a nested dictionary that will be
|
|
222
|
+
# used later when calling the transform. This is to avoid
|
|
223
|
+
# collisions in the dictionary.
|
|
224
|
+
params = {self.name: params}
|
|
225
|
+
|
|
226
|
+
return params
|
|
227
|
+
|
|
228
|
+
def batch_instantiate(
|
|
229
|
+
self,
|
|
230
|
+
states: list = None,
|
|
231
|
+
signal: AudioSignal = None,
|
|
232
|
+
):
|
|
233
|
+
"""Instantiates arguments for every item in a batch,
|
|
234
|
+
given a list of states. Each state in the list
|
|
235
|
+
corresponds to one item in the batch.
|
|
236
|
+
|
|
237
|
+
Parameters
|
|
238
|
+
----------
|
|
239
|
+
states : list, optional
|
|
240
|
+
List of states, by default None
|
|
241
|
+
signal : AudioSignal, optional
|
|
242
|
+
AudioSignal to pass to the ``self.instantiate`` section
|
|
243
|
+
if it is needed for this transform, by default None
|
|
244
|
+
|
|
245
|
+
Returns
|
|
246
|
+
-------
|
|
247
|
+
dict
|
|
248
|
+
Collated dictionary of arguments.
|
|
249
|
+
|
|
250
|
+
Examples
|
|
251
|
+
--------
|
|
252
|
+
|
|
253
|
+
>>> batch_size = 4
|
|
254
|
+
>>> signal = AudioSignal(audio_path, offset=10, duration=2)
|
|
255
|
+
>>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
|
|
256
|
+
>>>
|
|
257
|
+
>>> states = [seed + idx for idx in list(range(batch_size))]
|
|
258
|
+
>>> kwargs = transform.batch_instantiate(states, signal_batch)
|
|
259
|
+
>>> batch_output = transform(signal_batch, **kwargs)
|
|
260
|
+
"""
|
|
261
|
+
kwargs = []
|
|
262
|
+
for state in states:
|
|
263
|
+
kwargs.append(self.instantiate(state, signal))
|
|
264
|
+
kwargs = util.collate(kwargs)
|
|
265
|
+
return kwargs
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class Identity(BaseTransform):
|
|
269
|
+
"""This transform just returns the original signal."""
|
|
270
|
+
|
|
271
|
+
pass
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class SpectralTransform(BaseTransform):
|
|
275
|
+
"""Spectral transforms require STFT data to exist, since manipulations
|
|
276
|
+
of the STFT require the spectrogram. This just calls ``stft`` before
|
|
277
|
+
the transform is called, and calls ``istft`` after the transform is
|
|
278
|
+
called so that the audio data is written to after the spectral
|
|
279
|
+
manipulation.
|
|
280
|
+
"""
|
|
281
|
+
|
|
282
|
+
def transform(self, signal, **kwargs):
|
|
283
|
+
signal.stft()
|
|
284
|
+
super().transform(signal, **kwargs)
|
|
285
|
+
signal.istft()
|
|
286
|
+
return signal
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class Compose(BaseTransform):
|
|
290
|
+
"""Compose applies transforms in sequence, one after the other. The
|
|
291
|
+
transforms are passed in as positional arguments or as a list like so:
|
|
292
|
+
|
|
293
|
+
>>> transform = tfm.Compose(
|
|
294
|
+
>>> [
|
|
295
|
+
>>> tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
|
|
296
|
+
>>> tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
|
|
297
|
+
>>> ],
|
|
298
|
+
>>> )
|
|
299
|
+
|
|
300
|
+
This will convolve the signal with a room impulse response, and then
|
|
301
|
+
add background noise to the signal. Instantiate instantiates
|
|
302
|
+
all the parameters for every transform in the transform list so the
|
|
303
|
+
interface for using the Compose transform is the same as everything
|
|
304
|
+
else:
|
|
305
|
+
|
|
306
|
+
>>> kwargs = transform.instantiate()
|
|
307
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
308
|
+
|
|
309
|
+
Under the hood, the transform maps each transform to a unique name
|
|
310
|
+
under the hood of the form ``{position}.{name}``, where ``position``
|
|
311
|
+
is the index of the transform in the list. ``Compose`` can nest
|
|
312
|
+
within other ``Compose`` transforms, like so:
|
|
313
|
+
|
|
314
|
+
>>> preprocess = transforms.Compose(
|
|
315
|
+
>>> tfm.GlobalVolumeNorm(),
|
|
316
|
+
>>> tfm.CrossTalk(),
|
|
317
|
+
>>> name="preprocess",
|
|
318
|
+
>>> )
|
|
319
|
+
>>> augment = transforms.Compose(
|
|
320
|
+
>>> tfm.RoomImpulseResponse(),
|
|
321
|
+
>>> tfm.BackgroundNoise(),
|
|
322
|
+
>>> name="augment",
|
|
323
|
+
>>> )
|
|
324
|
+
>>> postprocess = transforms.Compose(
|
|
325
|
+
>>> tfm.VolumeChange(),
|
|
326
|
+
>>> tfm.RescaleAudio(),
|
|
327
|
+
>>> tfm.ShiftPhase(),
|
|
328
|
+
>>> name="postprocess",
|
|
329
|
+
>>> )
|
|
330
|
+
>>> transform = transforms.Compose(preprocess, augment, postprocess),
|
|
331
|
+
|
|
332
|
+
This defines 3 composed transforms, and then composes them in sequence
|
|
333
|
+
with one another.
|
|
334
|
+
|
|
335
|
+
Parameters
|
|
336
|
+
----------
|
|
337
|
+
*transforms : list
|
|
338
|
+
List of transforms to apply
|
|
339
|
+
name : str, optional
|
|
340
|
+
Name of this transform, used to identify it in the dictionary
|
|
341
|
+
produced by ``self.instantiate``, by default None
|
|
342
|
+
prob : float, optional
|
|
343
|
+
Probability of applying this transform, by default 1.0
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
|
|
347
|
+
if isinstance(transforms[0], list):
|
|
348
|
+
transforms = transforms[0]
|
|
349
|
+
|
|
350
|
+
for i, tfm in enumerate(transforms):
|
|
351
|
+
tfm.name = f"{i}.{tfm.name}"
|
|
352
|
+
|
|
353
|
+
keys = [tfm.name for tfm in transforms]
|
|
354
|
+
super().__init__(keys=keys, name=name, prob=prob)
|
|
355
|
+
|
|
356
|
+
self.transforms = transforms
|
|
357
|
+
self.transforms_to_apply = keys
|
|
358
|
+
|
|
359
|
+
@contextmanager
|
|
360
|
+
def filter(self, *names: list):
|
|
361
|
+
"""This can be used to skip transforms entirely when applying
|
|
362
|
+
the sequence of transforms to a signal. For example, take
|
|
363
|
+
the following transforms with the names ``preprocess, augment, postprocess``.
|
|
364
|
+
|
|
365
|
+
>>> preprocess = transforms.Compose(
|
|
366
|
+
>>> tfm.GlobalVolumeNorm(),
|
|
367
|
+
>>> tfm.CrossTalk(),
|
|
368
|
+
>>> name="preprocess",
|
|
369
|
+
>>> )
|
|
370
|
+
>>> augment = transforms.Compose(
|
|
371
|
+
>>> tfm.RoomImpulseResponse(),
|
|
372
|
+
>>> tfm.BackgroundNoise(),
|
|
373
|
+
>>> name="augment",
|
|
374
|
+
>>> )
|
|
375
|
+
>>> postprocess = transforms.Compose(
|
|
376
|
+
>>> tfm.VolumeChange(),
|
|
377
|
+
>>> tfm.RescaleAudio(),
|
|
378
|
+
>>> tfm.ShiftPhase(),
|
|
379
|
+
>>> name="postprocess",
|
|
380
|
+
>>> )
|
|
381
|
+
>>> transform = transforms.Compose(preprocess, augment, postprocess)
|
|
382
|
+
|
|
383
|
+
If we wanted to apply all 3 to a signal, we do:
|
|
384
|
+
|
|
385
|
+
>>> kwargs = transform.instantiate()
|
|
386
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
387
|
+
|
|
388
|
+
But if we only wanted to apply the ``preprocess`` and ``postprocess``
|
|
389
|
+
transforms to the signal, we do:
|
|
390
|
+
|
|
391
|
+
>>> with transform_fn.filter("preprocess", "postprocess"):
|
|
392
|
+
>>> output = transform(signal.clone(), **kwargs)
|
|
393
|
+
|
|
394
|
+
Parameters
|
|
395
|
+
----------
|
|
396
|
+
*names : list
|
|
397
|
+
List of transforms, identified by name, to apply to signal.
|
|
398
|
+
"""
|
|
399
|
+
old_transforms = self.transforms_to_apply
|
|
400
|
+
self.transforms_to_apply = names
|
|
401
|
+
yield
|
|
402
|
+
self.transforms_to_apply = old_transforms
|
|
403
|
+
|
|
404
|
+
def _transform(self, signal, **kwargs):
|
|
405
|
+
for transform in self.transforms:
|
|
406
|
+
if any([x in transform.name for x in self.transforms_to_apply]):
|
|
407
|
+
signal = transform(signal, **kwargs)
|
|
408
|
+
return signal
|
|
409
|
+
|
|
410
|
+
def _instantiate(self, state: RandomState, signal: AudioSignal = None):
|
|
411
|
+
parameters = {}
|
|
412
|
+
for transform in self.transforms:
|
|
413
|
+
parameters.update(transform.instantiate(state, signal=signal))
|
|
414
|
+
return parameters
|
|
415
|
+
|
|
416
|
+
def __getitem__(self, idx):
|
|
417
|
+
return self.transforms[idx]
|
|
418
|
+
|
|
419
|
+
def __len__(self):
|
|
420
|
+
return len(self.transforms)
|
|
421
|
+
|
|
422
|
+
def __iter__(self):
|
|
423
|
+
for transform in self.transforms:
|
|
424
|
+
yield transform
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
class Choose(Compose):
|
|
428
|
+
"""Choose logic is the same as :py:func:`audiotools.data.transforms.Compose`,
|
|
429
|
+
but instead of applying all the transforms in sequence, it applies just a single transform,
|
|
430
|
+
which is chosen for each item in the batch.
|
|
431
|
+
|
|
432
|
+
Parameters
|
|
433
|
+
----------
|
|
434
|
+
*transforms : list
|
|
435
|
+
List of transforms to apply
|
|
436
|
+
weights : list
|
|
437
|
+
Probability of choosing any specific transform.
|
|
438
|
+
name : str, optional
|
|
439
|
+
Name of this transform, used to identify it in the dictionary
|
|
440
|
+
produced by ``self.instantiate``, by default None
|
|
441
|
+
prob : float, optional
|
|
442
|
+
Probability of applying this transform, by default 1.0
|
|
443
|
+
|
|
444
|
+
Examples
|
|
445
|
+
--------
|
|
446
|
+
|
|
447
|
+
>>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
|
|
448
|
+
"""
|
|
449
|
+
|
|
450
|
+
def __init__(
|
|
451
|
+
self,
|
|
452
|
+
*transforms: list,
|
|
453
|
+
weights: list = None,
|
|
454
|
+
name: str = None,
|
|
455
|
+
prob: float = 1.0,
|
|
456
|
+
):
|
|
457
|
+
super().__init__(*transforms, name=name, prob=prob)
|
|
458
|
+
|
|
459
|
+
if weights is None:
|
|
460
|
+
_len = len(self.transforms)
|
|
461
|
+
weights = [1 / _len for _ in range(_len)]
|
|
462
|
+
self.weights = np.array(weights)
|
|
463
|
+
|
|
464
|
+
def _instantiate(self, state: RandomState, signal: AudioSignal = None):
|
|
465
|
+
kwargs = super()._instantiate(state, signal)
|
|
466
|
+
tfm_idx = list(range(len(self.transforms)))
|
|
467
|
+
tfm_idx = state.choice(tfm_idx, p=self.weights)
|
|
468
|
+
one_hot = []
|
|
469
|
+
for i, t in enumerate(self.transforms):
|
|
470
|
+
mask = kwargs[t.name]["mask"]
|
|
471
|
+
if mask.item():
|
|
472
|
+
kwargs[t.name]["mask"] = tt(i == tfm_idx)
|
|
473
|
+
one_hot.append(kwargs[t.name]["mask"])
|
|
474
|
+
kwargs["one_hot"] = one_hot
|
|
475
|
+
return kwargs
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
class Repeat(Compose):
|
|
479
|
+
"""Repeatedly applies a given transform ``n_repeat`` times."
|
|
480
|
+
|
|
481
|
+
Parameters
|
|
482
|
+
----------
|
|
483
|
+
transform : BaseTransform
|
|
484
|
+
Transform to repeat.
|
|
485
|
+
n_repeat : int, optional
|
|
486
|
+
Number of times to repeat transform, by default 1
|
|
487
|
+
"""
|
|
488
|
+
|
|
489
|
+
def __init__(
|
|
490
|
+
self,
|
|
491
|
+
transform,
|
|
492
|
+
n_repeat: int = 1,
|
|
493
|
+
name: str = None,
|
|
494
|
+
prob: float = 1.0,
|
|
495
|
+
):
|
|
496
|
+
transforms = [copy.copy(transform) for _ in range(n_repeat)]
|
|
497
|
+
super().__init__(transforms, name=name, prob=prob)
|
|
498
|
+
|
|
499
|
+
self.n_repeat = n_repeat
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
class RepeatUpTo(Choose):
|
|
503
|
+
"""Repeatedly applies a given transform up to ``max_repeat`` times."
|
|
504
|
+
|
|
505
|
+
Parameters
|
|
506
|
+
----------
|
|
507
|
+
transform : BaseTransform
|
|
508
|
+
Transform to repeat.
|
|
509
|
+
max_repeat : int, optional
|
|
510
|
+
Max number of times to repeat transform, by default 1
|
|
511
|
+
weights : list
|
|
512
|
+
Probability of choosing any specific number up to ``max_repeat``.
|
|
513
|
+
"""
|
|
514
|
+
|
|
515
|
+
def __init__(
|
|
516
|
+
self,
|
|
517
|
+
transform,
|
|
518
|
+
max_repeat: int = 5,
|
|
519
|
+
weights: list = None,
|
|
520
|
+
name: str = None,
|
|
521
|
+
prob: float = 1.0,
|
|
522
|
+
):
|
|
523
|
+
transforms = []
|
|
524
|
+
for n in range(1, max_repeat):
|
|
525
|
+
transforms.append(Repeat(transform, n_repeat=n))
|
|
526
|
+
super().__init__(transforms, name=name, prob=prob, weights=weights)
|
|
527
|
+
|
|
528
|
+
self.max_repeat = max_repeat
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
class ClippingDistortion(BaseTransform):
|
|
532
|
+
"""Adds clipping distortion to signal. Corresponds
|
|
533
|
+
to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.
|
|
534
|
+
|
|
535
|
+
Parameters
|
|
536
|
+
----------
|
|
537
|
+
perc : tuple, optional
|
|
538
|
+
Clipping percentile. Values are between 0.0 to 1.0.
|
|
539
|
+
Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1)
|
|
540
|
+
name : str, optional
|
|
541
|
+
Name of this transform, used to identify it in the dictionary
|
|
542
|
+
produced by ``self.instantiate``, by default None
|
|
543
|
+
prob : float, optional
|
|
544
|
+
Probability of applying this transform, by default 1.0
|
|
545
|
+
"""
|
|
546
|
+
|
|
547
|
+
def __init__(
|
|
548
|
+
self,
|
|
549
|
+
perc: tuple = ("uniform", 0.0, 0.1),
|
|
550
|
+
name: str = None,
|
|
551
|
+
prob: float = 1.0,
|
|
552
|
+
):
|
|
553
|
+
super().__init__(name=name, prob=prob)
|
|
554
|
+
|
|
555
|
+
self.perc = perc
|
|
556
|
+
|
|
557
|
+
def _instantiate(self, state: RandomState):
|
|
558
|
+
return {"perc": util.sample_from_dist(self.perc, state)}
|
|
559
|
+
|
|
560
|
+
def _transform(self, signal, perc):
|
|
561
|
+
return signal.clip_distortion(perc)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
class Equalizer(BaseTransform):
|
|
565
|
+
"""Applies an equalization curve to the audio signal. Corresponds
|
|
566
|
+
to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.
|
|
567
|
+
|
|
568
|
+
Parameters
|
|
569
|
+
----------
|
|
570
|
+
eq_amount : tuple, optional
|
|
571
|
+
The maximum dB cut to apply to the audio in any band,
|
|
572
|
+
by default ("const", 1.0 dB)
|
|
573
|
+
n_bands : int, optional
|
|
574
|
+
Number of bands in EQ, by default 6
|
|
575
|
+
name : str, optional
|
|
576
|
+
Name of this transform, used to identify it in the dictionary
|
|
577
|
+
produced by ``self.instantiate``, by default None
|
|
578
|
+
prob : float, optional
|
|
579
|
+
Probability of applying this transform, by default 1.0
|
|
580
|
+
"""
|
|
581
|
+
|
|
582
|
+
def __init__(
|
|
583
|
+
self,
|
|
584
|
+
eq_amount: tuple = ("const", 1.0),
|
|
585
|
+
n_bands: int = 6,
|
|
586
|
+
name: str = None,
|
|
587
|
+
prob: float = 1.0,
|
|
588
|
+
):
|
|
589
|
+
super().__init__(name=name, prob=prob)
|
|
590
|
+
|
|
591
|
+
self.eq_amount = eq_amount
|
|
592
|
+
self.n_bands = n_bands
|
|
593
|
+
|
|
594
|
+
def _instantiate(self, state: RandomState):
|
|
595
|
+
eq_amount = util.sample_from_dist(self.eq_amount, state)
|
|
596
|
+
eq = -eq_amount * state.rand(self.n_bands)
|
|
597
|
+
return {"eq": eq}
|
|
598
|
+
|
|
599
|
+
def _transform(self, signal, eq):
|
|
600
|
+
return signal.equalizer(eq)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
class Quantization(BaseTransform):
|
|
604
|
+
"""Applies quantization to the input waveform. Corresponds
|
|
605
|
+
to :py:func:`audiotools.core.effects.EffectMixin.quantization`.
|
|
606
|
+
|
|
607
|
+
Parameters
|
|
608
|
+
----------
|
|
609
|
+
channels : tuple, optional
|
|
610
|
+
Number of evenly spaced quantization channels to quantize
|
|
611
|
+
to, by default ("choice", [8, 32, 128, 256, 1024])
|
|
612
|
+
name : str, optional
|
|
613
|
+
Name of this transform, used to identify it in the dictionary
|
|
614
|
+
produced by ``self.instantiate``, by default None
|
|
615
|
+
prob : float, optional
|
|
616
|
+
Probability of applying this transform, by default 1.0
|
|
617
|
+
"""
|
|
618
|
+
|
|
619
|
+
def __init__(
|
|
620
|
+
self,
|
|
621
|
+
channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
|
|
622
|
+
name: str = None,
|
|
623
|
+
prob: float = 1.0,
|
|
624
|
+
):
|
|
625
|
+
super().__init__(name=name, prob=prob)
|
|
626
|
+
|
|
627
|
+
self.channels = channels
|
|
628
|
+
|
|
629
|
+
def _instantiate(self, state: RandomState):
|
|
630
|
+
return {"channels": util.sample_from_dist(self.channels, state)}
|
|
631
|
+
|
|
632
|
+
def _transform(self, signal, channels):
|
|
633
|
+
return signal.quantization(channels)
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
class MuLawQuantization(BaseTransform):
|
|
637
|
+
"""Applies mu-law quantization to the input waveform. Corresponds
|
|
638
|
+
to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.
|
|
639
|
+
|
|
640
|
+
Parameters
|
|
641
|
+
----------
|
|
642
|
+
channels : tuple, optional
|
|
643
|
+
Number of mu-law spaced quantization channels to quantize
|
|
644
|
+
to, by default ("choice", [8, 32, 128, 256, 1024])
|
|
645
|
+
name : str, optional
|
|
646
|
+
Name of this transform, used to identify it in the dictionary
|
|
647
|
+
produced by ``self.instantiate``, by default None
|
|
648
|
+
prob : float, optional
|
|
649
|
+
Probability of applying this transform, by default 1.0
|
|
650
|
+
"""
|
|
651
|
+
|
|
652
|
+
def __init__(
|
|
653
|
+
self,
|
|
654
|
+
channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
|
|
655
|
+
name: str = None,
|
|
656
|
+
prob: float = 1.0,
|
|
657
|
+
):
|
|
658
|
+
super().__init__(name=name, prob=prob)
|
|
659
|
+
|
|
660
|
+
self.channels = channels
|
|
661
|
+
|
|
662
|
+
def _instantiate(self, state: RandomState):
|
|
663
|
+
return {"channels": util.sample_from_dist(self.channels, state)}
|
|
664
|
+
|
|
665
|
+
def _transform(self, signal, channels):
|
|
666
|
+
return signal.mulaw_quantization(channels)
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
class NoiseFloor(BaseTransform):
|
|
670
|
+
"""Adds a noise floor of Gaussian noise to the signal at a specified
|
|
671
|
+
dB.
|
|
672
|
+
|
|
673
|
+
Parameters
|
|
674
|
+
----------
|
|
675
|
+
db : tuple, optional
|
|
676
|
+
Level of noise to add to signal, by default ("const", -50.0)
|
|
677
|
+
name : str, optional
|
|
678
|
+
Name of this transform, used to identify it in the dictionary
|
|
679
|
+
produced by ``self.instantiate``, by default None
|
|
680
|
+
prob : float, optional
|
|
681
|
+
Probability of applying this transform, by default 1.0
|
|
682
|
+
"""
|
|
683
|
+
|
|
684
|
+
def __init__(
|
|
685
|
+
self,
|
|
686
|
+
db: tuple = ("const", -50.0),
|
|
687
|
+
name: str = None,
|
|
688
|
+
prob: float = 1.0,
|
|
689
|
+
):
|
|
690
|
+
super().__init__(name=name, prob=prob)
|
|
691
|
+
|
|
692
|
+
self.db = db
|
|
693
|
+
|
|
694
|
+
def _instantiate(self, state: RandomState, signal: AudioSignal):
|
|
695
|
+
db = util.sample_from_dist(self.db, state)
|
|
696
|
+
audio_data = state.randn(signal.num_channels, signal.signal_length)
|
|
697
|
+
nz_signal = AudioSignal(audio_data, signal.sample_rate)
|
|
698
|
+
nz_signal.normalize(db)
|
|
699
|
+
return {"nz_signal": nz_signal}
|
|
700
|
+
|
|
701
|
+
def _transform(self, signal, nz_signal):
|
|
702
|
+
        # Addition returns a new signal, so the transform can be repeatedly
        # applied to different signals with the same effect.
        return signal + nz_signal


class BackgroundNoise(BaseTransform):
    """Adds background noise from audio specified by a set of CSV files.

    A valid CSV file looks like, and is typically generated by
    :py:func:`audiotools.data.preprocess.create_csv`:

    .. csv-table::
        :header: path

        room_tone/m6_script2_clean.wav
        room_tone/m6_script2_cleanraw.wav
        room_tone/m6_script2_ipad_balcony1.wav
        room_tone/m6_script2_ipad_bedroom1.wav
        room_tone/m6_script2_ipad_confroom1.wav
        room_tone/m6_script2_ipad_confroom2.wav
        room_tone/m6_script2_ipad_livingroom1.wav
        room_tone/m6_script2_ipad_office1.wav

    .. note::
        All paths are relative to an environment variable called ``PATH_TO_DATA``,
        so that CSV files are portable across machines where data may be
        located in different places.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
    hood.

    Parameters
    ----------
    snr : tuple, optional
        Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 3
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default None
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 10.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 3,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = None,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        eq = -eq_amount * state.rand(self.n_bands)
        snr = util.sample_from_dist(self.snr, state)

        bg_signal = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )["signal"]

        return {"eq": eq, "bg_signal": bg_signal, "snr": snr}

    def _transform(self, signal, bg_signal, snr, eq):
        # Clone bg_signal so that transform can be repeatedly applied
        # to different signals with the same effect.
        return signal.mix(bg_signal.clone(), snr, eq)

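A minimal usage sketch for ``BackgroundNoise`` follows. It assumes the ``instantiate``/apply pattern that ``BaseTransform`` defines earlier in this file (``instantiate`` samples the random parameters into a dict, and calling the transform applies them); ``speech.wav`` and ``noise.csv`` are placeholder paths.

import numpy as np

from audiotools import AudioSignal
from audiotools.data import transforms as tfm

state = np.random.RandomState(0)
signal = AudioSignal.excerpt("speech.wav", duration=2.0)  # placeholder file
noise = tfm.BackgroundNoise(snr=("uniform", 10.0, 30.0), sources=["noise.csv"])
kwargs = noise.instantiate(state, signal)
# Mixes an EQ'd noise excerpt at the sampled SNR.
noisy = noise(signal.clone(), **kwargs)
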
class CrossTalk(BaseTransform):
    """Adds crosstalk between speakers, whose audio is drawn from a CSV file
    that was produced via :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    under the hood.

    Parameters
    ----------
    snr : tuple, optional
        How loud cross-talk speaker is relative to original signal in dB,
        by default ("uniform", 0.0, 10.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default -40
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 0.0, 10.0),
        sources: List[str] = None,
        weights: List[float] = None,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = -40,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        snr = util.sample_from_dist(self.snr, state)
        crosstalk_signal = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )["signal"]

        return {"crosstalk_signal": crosstalk_signal, "snr": snr}

    def _transform(self, signal, crosstalk_signal, snr):
        # Clone crosstalk_signal so that transform can be repeatedly applied
        # to different signals with the same effect.
        loudness = signal.loudness()
        mix = signal.mix(crosstalk_signal.clone(), snr)
        mix.normalize(loudness)
        return mix

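``CrossTalk`` composes naturally with the other transforms above. A sketch continuing the setup from the ``BackgroundNoise`` example, assuming the ``Compose`` container defined earlier in this file; both CSV paths are placeholders.

augment = tfm.Compose(
    [
        tfm.CrossTalk(sources=["speakers.csv"]),
        tfm.BackgroundNoise(sources=["noise.csv"]),
    ]
)
kwargs = augment.instantiate(state, signal)
augmented = augment(signal.clone(), **kwargs)
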
class RoomImpulseResponse(BaseTransform):
    """Convolves signal with a room impulse response, at a specified
    direct-to-reverberant ratio, with equalization applied. Room impulse
    response data is drawn from a CSV file that was produced via
    :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
    under the hood.

    Parameters
    ----------
    drr : tuple, optional
        Direct-to-reverberant ratio in dB, by default ("uniform", 0.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    use_original_phase : bool, optional
        Whether or not to use the original phase, by default False
    offset : float, optional
        Offset from each impulse response file to use, by default 0.0
    duration : float, optional
        Duration of each impulse response, by default 1.0
    """

    def __init__(
        self,
        drr: tuple = ("uniform", 0.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
        use_original_phase: bool = False,
        offset: float = 0.0,
        duration: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.drr = drr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.use_original_phase = use_original_phase

        self.loader = AudioLoader(sources, weights)
        self.offset = offset
        self.duration = duration

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        eq = -eq_amount * state.rand(self.n_bands)
        drr = util.sample_from_dist(self.drr, state)

        ir_signal = self.loader(
            state,
            signal.sample_rate,
            offset=self.offset,
            duration=self.duration,
            loudness_cutoff=None,
            num_channels=signal.num_channels,
        )["signal"]
        ir_signal.zero_pad_to(signal.sample_rate)

        return {"eq": eq, "ir_signal": ir_signal, "drr": drr}

    def _transform(self, signal, ir_signal, drr, eq):
        # Clone ir_signal so that transform can be repeatedly applied
        # to different signals with the same effect.
        return signal.apply_ir(
            ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase
        )

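The ``clone()`` inside ``_transform`` is what makes one instantiated effect reusable: the sampled IR, DRR, and EQ can be replayed exactly. A sketch continuing the setup above, with ``irs.csv`` a placeholder:

reverb = tfm.RoomImpulseResponse(sources=["irs.csv"])
kwargs = reverb.instantiate(state, signal)
wet_a = reverb(signal.clone(), **kwargs)
# Re-applying the same kwargs renders the identical room on a second take.
wet_b = reverb(signal.clone(), **kwargs)
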
class VolumeChange(BaseTransform):
    """Changes the volume of the input signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        Change in volume in decibels, by default ("uniform", -12.0, 0.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("uniform", -12.0, 0.0),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)
        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.volume_change(db)


class VolumeNorm(BaseTransform):
    """Normalizes the volume of the excerpt to a specified decibel.

    Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.normalize(db)

class GlobalVolumeNorm(BaseTransform):
    """Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
    transform also normalizes the volume of a signal, but it uses
    the volume of the entire audio file the loaded excerpt comes from,
    rather than the volume of just the excerpt. The volume of the
    entire audio file is expected in ``signal.metadata["loudness"]``.
    If loading audio from a CSV generated by :py:func:`audiotools.data.preprocess.create_csv`
    with ``loudness = True``, like the following:

    .. csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    The ``AudioLoader`` will automatically load the loudness column into
    the metadata of the signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        if "loudness" not in signal.metadata:
            db_change = 0.0
        elif float(signal.metadata["loudness"]) == float("-inf"):
            db_change = 0.0
        else:
            db = util.sample_from_dist(self.db, state)
            db_change = db - float(signal.metadata["loudness"])

        return {"db": db_change}

    def _transform(self, signal, db):
        return signal.volume_change(db)

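The returned ``db`` is a relative gain rather than an absolute target: with the default target of -24 dB and a file-level loudness of -16.5 dB (the ``f3_script1_produced.wav`` row above), the excerpt is shifted by -24 - (-16.5) = -7.5 dB. A sketch continuing the setup above; the metadata assignment stands in for what ``AudioLoader`` normally fills in:

norm = tfm.GlobalVolumeNorm(db=("const", -24))
signal.metadata["loudness"] = -16.5  # normally populated by AudioLoader
kwargs = norm.instantiate(state, signal)
normalized = norm(signal.clone(), **kwargs)  # applies the -7.5 dB change
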
class Silence(BaseTransform):
    """Zeros out the signal with some probability.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 0.1
    """

    def __init__(self, name: str = None, prob: float = 0.1):
        super().__init__(name=name, prob=prob)

    def _transform(self, signal):
        _loudness = signal._loudness
        signal = AudioSignal(
            torch.zeros_like(signal.audio_data),
            sample_rate=signal.sample_rate,
            stft_params=signal.stft_params,
        )
        # So that the amount of noise added is as if it wasn't silenced.
        # TODO: improve this hack
        signal._loudness = _loudness

        return signal

class LowPass(BaseTransform):
    """Applies a LowPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [4000, 8000, 16000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [4000, 8000, 16000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)

        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.low_pass(cutoff, zeros=self.zeros)


class HighPass(BaseTransform):
    """Applies a HighPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [50, 100, 250, 500, 1000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)

        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.high_pass(cutoff, zeros=self.zeros)

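``HighPass`` and ``LowPass`` chain into a simple band-limiting effect, for example a rough telephone band of about 250 Hz to 4 kHz. A sketch continuing the setup above, pinning the cutoffs with ``("const", ...)`` distributions:

band_limit = tfm.Compose(
    [
        tfm.HighPass(cutoff=("const", 250)),
        tfm.LowPass(cutoff=("const", 4000)),
    ]
)
kwargs = band_limit.instantiate(state, signal)
telephone_like = band_limit(signal.clone(), **kwargs)
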
class RescaleAudio(BaseTransform):
    """Rescales the audio so it is in between ``-val`` and ``val``
    only if the original audio exceeds those bounds. Useful if
    transforms have caused the audio to clip.

    Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.

    Parameters
    ----------
    val : float, optional
        Max absolute value of signal, by default 1.0
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, val: float = 1.0, name: str = None, prob: float = 1):
        super().__init__(name=name, prob=prob)

        self.val = val

    def _transform(self, signal):
        return signal.ensure_max_of_audio(self.val)

class ShiftPhase(SpectralTransform):
    """Shifts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    shift : tuple, optional
        How much to shift phase by, by default ("uniform", -np.pi, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        shift: tuple = ("uniform", -np.pi, np.pi),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.shift = shift

    def _instantiate(self, state: RandomState):
        return {"shift": util.sample_from_dist(self.shift, state)}

    def _transform(self, signal, shift):
        return signal.shift_phase(shift)


class InvertPhase(ShiftPhase):
    """Inverts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, name: str = None, prob: float = 1):
        super().__init__(shift=("const", np.pi), name=name, prob=prob)

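Since ``InvertPhase`` is just ``ShiftPhase`` pinned to a constant shift of pi, adding the inverted output back to the original approximately cancels it, up to STFT round-trip error. A sketch continuing the setup above:

inv = tfm.InvertPhase()
kwargs = inv.instantiate(state, signal)
inverted = inv(signal.clone(), **kwargs)
residual = signal + inverted  # close to silence
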
class CorruptPhase(SpectralTransform):
    """Corrupts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`.

    Parameters
    ----------
    scale : tuple, optional
        How much to corrupt phase by, by default ("uniform", 0, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
    ):
        super().__init__(name=name, prob=prob)
        self.scale = scale

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        scale = util.sample_from_dist(self.scale, state)
        corruption = state.normal(scale=scale, size=signal.phase.shape[1:])
        return {"corruption": corruption.astype("float32")}

    def _transform(self, signal, corruption):
        return signal.shift_phase(shift=corruption)

class FrequencyMask(SpectralTransform):
    """Masks a band of frequencies at a center frequency
    from the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.f_center = f_center
        self.f_width = f_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        f_center = util.sample_from_dist(self.f_center, state)
        f_width = util.sample_from_dist(self.f_width, state)

        fmin = max(f_center - (f_width / 2), 0.0)
        fmax = min(f_center + (f_width / 2), 1.0)

        fmin_hz = (signal.sample_rate / 2) * fmin
        fmax_hz = (signal.sample_rate / 2) * fmax

        return {"fmin_hz": fmin_hz, "fmax_hz": fmax_hz}

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)


class TimeMask(SpectralTransform):
    """Masks out contiguous time-steps from signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.t_center = t_center
        self.t_width = t_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        t_center = util.sample_from_dist(self.t_center, state)
        t_width = util.sample_from_dist(self.t_width, state)

        tmin = max(t_center - (t_width / 2), 0.0)
        tmax = min(t_center + (t_width / 2), 1.0)

        tmin_s = signal.signal_duration * tmin
        tmax_s = signal.signal_duration * tmax
        return {"tmin_s": tmin_s, "tmax_s": tmax_s}

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)

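``f_center`` and ``f_width`` are fractions of Nyquist that ``_instantiate`` converts to Hz. At a 44100 Hz sample rate, ``f_center=0.5`` with ``f_width=0.1`` masks the band from 0.45 x 22050 = 9922.5 Hz up to 0.55 x 22050 = 12127.5 Hz. A sketch continuing the setup above:

fmask = tfm.FrequencyMask(f_center=("const", 0.5), f_width=("const", 0.1))
kwargs = fmask.instantiate(state, signal)  # yields fmin_hz and fmax_hz
masked = fmask(signal.clone(), **kwargs)
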
class MaskLowMagnitudes(SpectralTransform):
    """Masks low magnitude regions out of signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.

    Parameters
    ----------
    db_cutoff : tuple, optional
        Decibel value for which things below it will be masked away,
        by default ("uniform", -10, 10)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db_cutoff: tuple = ("uniform", -10, 10),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.db_cutoff = db_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        return {"db_cutoff": util.sample_from_dist(self.db_cutoff, state)}

    def _transform(self, signal, db_cutoff: float):
        return signal.mask_low_magnitudes(db_cutoff)

class Smoothing(BaseTransform):
    """Convolves the signal with a smoothing window.

    Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.

    Parameters
    ----------
    window_type : tuple, optional
        Type of window to use, by default ("const", "average")
    window_length : tuple, optional
        Length of smoothing window, by
        default ("choice", [8, 16, 32, 64, 128, 256, 512])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        window_type: tuple = ("const", "average"),
        window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.window_type = window_type
        self.window_length = window_length

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        window_type = util.sample_from_dist(self.window_type, state)
        window_length = util.sample_from_dist(self.window_length, state)
        window = signal.get_window(
            window_type=window_type, window_length=window_length, device="cpu"
        )
        return {"window": AudioSignal(window, signal.sample_rate)}

    def _transform(self, signal, window):
        # Peak level of the input, guarding against all-zero channels.
        sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
        sscale[sscale == 0.0] = 1.0

        out = signal.convolve(window)

        # Rescale so the smoothed output peaks at the same level as the input.
        oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
        oscale[oscale == 0.0] = 1.0

        out = out * (sscale / oscale)
        return out

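``Smoothing`` blurs the waveform with a short window and then matches the output peak to the input peak, so the convolution does not change the overall level. A sketch continuing the setup above:

smooth = tfm.Smoothing(window_length=("const", 128))
kwargs = smooth.instantiate(state, signal)
blurred = smooth(signal.clone(), **kwargs)
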
class TimeNoise(TimeMask):
    """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob)

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)
        mag, phase = signal.magnitude, signal.phase

        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase
        return signal

class FrequencyNoise(FrequencyMask):
    """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob)

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
        mag, phase = signal.magnitude, signal.phase

        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase
        return signal

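``TimeNoise`` and ``FrequencyNoise`` share one trick: after masking with zeros, the elementwise ``(mag == 0.0) * (phase == 0.0)`` test recovers exactly the zeroed bins, which are then refilled with Gaussian noise. A sketch continuing the setup above:

tnoise = tfm.TimeNoise(t_width=("const", 0.05))
kwargs = tnoise.instantiate(state, signal)
noised = tnoise(signal.clone(), **kwargs)
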
class SpectralDenoising(Equalizer):
    """Applies denoising algorithm detailed in
    :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
    using a randomly generated noise signal for denoising.

    Parameters
    ----------
    eq_amount : tuple, optional
        Amount of eq to apply to noise signal, by default ("const", 1.0)
    denoise_amount : tuple, optional
        Amount to denoise by, by default ("uniform", 0.8, 1.0)
    nz_volume : float, optional
        Volume of noise to denoise with, by default -40
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    n_freq : int, optional
        Number of frequency bins to smooth by, by default 3
    n_time : int, optional
        Number of time bins to smooth by, by default 5
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        denoise_amount: tuple = ("uniform", 0.8, 1.0),
        nz_volume: float = -40,
        n_bands: int = 6,
        n_freq: int = 3,
        n_time: int = 5,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob)

        self.nz_volume = nz_volume
        self.denoise_amount = denoise_amount
        self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)

    def _transform(self, signal, nz, eq, denoise_amount):
        nz = nz.normalize(self.nz_volume).equalizer(eq)
        self.spectral_gate = self.spectral_gate.to(signal.device)
        signal = self.spectral_gate(signal, nz, denoise_amount)
        return signal

    def _instantiate(self, state: RandomState):
        kwargs = super()._instantiate(state)
        kwargs["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
        kwargs["nz"] = AudioSignal(state.randn(22050), 44100)
        return kwargs
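``SpectralDenoising`` draws a fresh white-noise reference per instantiation (22050 samples at 44100 Hz, i.e. half a second), normalizes it to ``nz_volume`` and EQs it, then gates the input's spectrogram against it. A sketch continuing the setup above, reusing the ``noisy`` signal from the ``BackgroundNoise`` example:

denoise = tfm.SpectralDenoising(denoise_amount=("const", 0.9))
kwargs = denoise.instantiate(state)
cleaned = denoise(noisy.clone(), **kwargs)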