sonusai 0.20.3__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +16 -3
- sonusai/audiofe.py +241 -77
- sonusai/calc_metric_spenh.py +71 -73
- sonusai/config/__init__.py +3 -0
- sonusai/config/config.py +61 -0
- sonusai/config/config.yml +20 -0
- sonusai/config/constants.py +8 -0
- sonusai/constants.py +11 -0
- sonusai/data/genmixdb.yml +21 -36
- sonusai/{mixture/datatypes.py → datatypes.py} +91 -130
- sonusai/deprecated/plot.py +4 -5
- sonusai/doc/doc.py +4 -4
- sonusai/doc.py +11 -4
- sonusai/genft.py +43 -45
- sonusai/genmetrics.py +25 -19
- sonusai/genmix.py +54 -82
- sonusai/genmixdb.py +88 -264
- sonusai/ir_metric.py +30 -34
- sonusai/lsdb.py +41 -48
- sonusai/main.py +15 -22
- sonusai/metrics/calc_audio_stats.py +4 -293
- sonusai/metrics/calc_class_weights.py +4 -4
- sonusai/metrics/calc_optimal_thresholds.py +8 -5
- sonusai/metrics/calc_pesq.py +2 -2
- sonusai/metrics/calc_segsnr_f.py +4 -4
- sonusai/metrics/calc_speech.py +25 -13
- sonusai/metrics/class_summary.py +7 -7
- sonusai/metrics/confusion_matrix_summary.py +5 -5
- sonusai/metrics/one_hot.py +4 -4
- sonusai/metrics/snr_summary.py +7 -7
- sonusai/metrics_summary.py +38 -45
- sonusai/mixture/__init__.py +4 -104
- sonusai/mixture/audio.py +10 -39
- sonusai/mixture/class_balancing.py +103 -0
- sonusai/mixture/config.py +251 -271
- sonusai/mixture/constants.py +35 -39
- sonusai/mixture/data_io.py +25 -36
- sonusai/mixture/db_datatypes.py +58 -22
- sonusai/mixture/effects.py +386 -0
- sonusai/mixture/feature.py +7 -11
- sonusai/mixture/generation.py +478 -628
- sonusai/mixture/helpers.py +82 -184
- sonusai/mixture/ir_delay.py +3 -4
- sonusai/mixture/ir_effects.py +77 -0
- sonusai/mixture/log_duration_and_sizes.py +6 -12
- sonusai/mixture/mixdb.py +910 -729
- sonusai/mixture/pad_audio.py +35 -0
- sonusai/mixture/resample.py +7 -0
- sonusai/mixture/sox_effects.py +195 -0
- sonusai/mixture/sox_help.py +650 -0
- sonusai/mixture/spectral_mask.py +2 -2
- sonusai/mixture/truth.py +17 -15
- sonusai/mixture/truth_functions/crm.py +12 -12
- sonusai/mixture/truth_functions/energy.py +22 -22
- sonusai/mixture/truth_functions/file.py +5 -5
- sonusai/mixture/truth_functions/metadata.py +4 -4
- sonusai/mixture/truth_functions/metrics.py +4 -4
- sonusai/mixture/truth_functions/phoneme.py +3 -3
- sonusai/mixture/truth_functions/sed.py +11 -13
- sonusai/mixture/truth_functions/target.py +10 -10
- sonusai/mkwav.py +26 -29
- sonusai/onnx_predict.py +240 -88
- sonusai/queries/__init__.py +2 -2
- sonusai/queries/queries.py +38 -34
- sonusai/speech/librispeech.py +1 -1
- sonusai/speech/mcgill.py +1 -1
- sonusai/speech/timit.py +2 -2
- sonusai/summarize_metric_spenh.py +10 -17
- sonusai/utils/__init__.py +7 -1
- sonusai/utils/asl_p56.py +2 -2
- sonusai/utils/asr.py +2 -2
- sonusai/utils/asr_functions/aaware_whisper.py +4 -5
- sonusai/utils/choice.py +31 -0
- sonusai/utils/compress.py +1 -1
- sonusai/utils/dataclass_from_dict.py +19 -1
- sonusai/utils/energy_f.py +3 -3
- sonusai/utils/evaluate_random_rule.py +15 -0
- sonusai/utils/keyboard_interrupt.py +12 -0
- sonusai/utils/onnx_utils.py +3 -17
- sonusai/utils/print_mixture_details.py +21 -19
- sonusai/utils/{temp_seed.py → rand.py} +3 -3
- sonusai/utils/read_predict_data.py +2 -2
- sonusai/utils/reshape.py +3 -3
- sonusai/utils/stratified_shuffle_split.py +3 -3
- sonusai/{mixture → utils}/tokenized_shell_vars.py +1 -1
- sonusai/utils/write_audio.py +2 -2
- sonusai/vars.py +11 -4
- {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/METADATA +4 -2
- sonusai-1.0.2.dist-info/RECORD +138 -0
- sonusai/mixture/augmentation.py +0 -444
- sonusai/mixture/class_count.py +0 -15
- sonusai/mixture/eq_rule_is_valid.py +0 -45
- sonusai/mixture/target_class_balancing.py +0 -107
- sonusai/mixture/targets.py +0 -175
- sonusai-0.20.3.dist-info/RECORD +0 -128
- {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/WHEEL +0 -0
- {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,650 @@
|
|
1
|
+
def allpass() -> str:
    """Return the usage line and description for the ``allpass`` effect."""
    help_text = """
allpass frequency[k] width[h|k|o|q]
Apply a two-pole all-pass filter with central frequency (in Hz) frequency,
and filter-width width. An all-pass filter changes the audio's frequency to
phase relationship without changing its frequency to amplitude relationship.
"""
    return help_text
|
8
|
+
|
9
|
+
|
10
|
+
def band() -> str:
    """Return the usage line and description for the ``band`` effect."""
    help_text = """
band [-n] center[k] [width[h|k|o|q]]
Apply a band-pass filter. The frequency response drops logarithmically around
the center frequency. The width parameter gives the slope of the drop. The
frequencies at center + width and center - width will be half of their original
amplitudes. band defaults to a mode oriented to pitched audio, i.e. voice,
singing, or instrumental music. The -n (for noise) option uses the alternate
mode for un-pitched audio (e.g. percussion). Warning: -n introduces a power-gain
of about 11 dB in the filter, so beware of output clipping. band introduces noise
in the shape of the filter, i.e. peaking at the center frequency and settling
around it.

See also sinc for a bandpass filter with steeper shoulders.
"""
    return help_text
|
25
|
+
|
26
|
+
|
27
|
+
def bandpass() -> str:
    """Return the usage line and description for the ``bandpass`` effect."""
    help_text = """
bandpass [-c] frequency[k] width[h|k|o|q]
Apply a two-pole Butterworth band-pass filter with central frequency frequency,
and (3 dB-point) band-width width. The -c option applies only to bandpass and
selects a constant skirt gain (peak gain = Q) instead of the default: constant
0 dB peak gain. The filters roll off at 6 dB per octave (20 dB per decade).

See also sinc for a bandpass filter with steeper shoulders.
"""
    return help_text
|
37
|
+
|
38
|
+
|
39
|
+
def bandreject() -> str:
    """Return the usage line and description for the ``bandreject`` effect.

    The usage line deliberately omits ``[-c]``: the original help text stated
    that -c applies only to bandpass, so advertising it here was misleading.
    """
    help_text = """
bandreject frequency[k] width[h|k|o|q]
Apply a two-pole Butterworth band-reject filter with central frequency frequency,
and (3 dB-point) band-width width. Unlike bandpass, bandreject does not take the
-c (constant skirt gain) option. The filters roll off at 6 dB per octave (20 dB
per decade).

See also sinc for a bandpass filter with steeper shoulders.
"""
    return help_text
|
49
|
+
|
50
|
+
|
51
|
+
def bass() -> str:
    """Return the usage line and description for the ``bass`` effect."""
    help_text = """
bass gain [frequency[k] [width[s|h|k|o|q]]]
Boost or cut the bass (lower) frequencies of the audio using a two-pole shelving
filter with a response similar to that of a standard hi-fi's tone-controls. This
is also known as shelving equalisation (EQ).

gain gives the gain at 0 Hz. Its useful range is about -20 (for a large cut) to
+20 (for a large boost). Beware of Clipping when using a positive gain.

If desired, the filter can be fine-tuned using the following optional parameters:

frequency sets the filter's central frequency and so can be used to extend or reduce
the frequency range to be boosted or cut. The default value is 100 Hz.

width determines how steep is the filter's shelf transition. In addition to the
common width specification methods described above, 'slope' (the default, or if
appended with 's') may be used. The useful range of 'slope' is about 0.3, for a
gentle slope, to 1 (the maximum), for a steep slope; the default value is 0.5.

See also equalizer for a peaking equalisation effect.
"""
    return help_text
|
73
|
+
|
74
|
+
|
75
|
+
def biquad() -> str:
    """Return the usage line and description for the ``biquad`` effect."""
    help_text = """
biquad b0 b1 b2 a0 a1 a2
Apply a biquad IIR filter with the given coefficients. Where b* and a* are the numerator
and denominator coefficients respectively.

See https://en.wikipedia.org/wiki/Digital_biquad_filter (where a0 = 1).
"""
    return help_text
|
83
|
+
|
84
|
+
|
85
|
+
def chorus() -> str:
    """Return the usage line, description and examples for the ``chorus`` effect."""
    help_text = """
chorus gain-in gain-out <delay decay speed depth -s|-t>
Add a chorus effect to the audio. This can make a single vocal sound like a chorus,
but can also be applied to instrumentation.

Chorus resembles an echo effect with a short delay, but whereas with echo the delay
is constant, with chorus, it is varied using sinusoidal or triangular modulation. The
modulation depth defines the range the modulated delay is played before or after the
delay. Hence the delayed sound will sound slower or faster, that is the delayed sound
tuned around the original one, like in a chorus where some vocals are slightly off key.

Each four-tuple parameter delay/decay/speed/depth gives the delay in milliseconds and
the decay (relative to gain-in) with a modulation speed in Hz using depth in
milliseconds. The modulation is either sinusoidal (-s) or triangular (-t). Gain-out is
the volume of the output.

A typical delay is around 40 ms to 60 ms; the modulation speed is best near 0.25 Hz and
the modulation depth around 2 ms. For example, a single delay:

play guitar1.wav chorus 0.7 0.9 55 0.4 0.25 2 -t

Two delays of the original samples:

play guitar1.wav chorus 0.6 0.9 50 0.4 0.25 2 -t 60 0.32 0.4 1.3 -s

A fuller sounding chorus (with three additional delays):

play guitar1.wav chorus 0.5 0.9 50 0.4 0.25 2 -t 60 0.32 0.4 2.3 -t 40 0.3 0.3 1.3 -s
"""
    return help_text
|
115
|
+
|
116
|
+
|
117
|
+
def compand() -> str:
    """Return the usage line, description and examples for the ``compand`` effect."""
    help_text = """
compand attack1,decay1{,attack2,decay2} [soft-knee-dB:]in-dB1[,out-dB1]{,in-dB2,out-dB2} [gain [initial-volume-dB [delay]]]
Compand (compress or expand) the dynamic range of the audio.

The attack and decay parameters (in seconds) determine the time over which the
instantaneous level of the input signal is averaged to determine its volume; attacks
refer to increases in volume and decays refer to decreases. For most situations, the
attack time (response to the music getting louder) should be shorter than the decay
time because the human ear is more sensitive to sudden loud music than sudden soft
music. Where more than one pair of attack/decay parameters are specified, each input
channel is companded separately and the number of pairs must agree with the number of
input channels. Typical values are 0.3,0.8 seconds.

The second parameter is a list of points on the compander's transfer function specified
in dB relative to the maximum possible signal amplitude. The input values must be in a
strictly increasing order but the transfer function does not have to be monotonically
rising. If omitted, the value of out-dB1 defaults to the same value as in-dB1; levels
below in-dB1 are not companded (but may have gain applied to them). The point 0,0 is
assumed but may be overridden (by 0,out-dBn). If the list is preceded by a soft-knee-dB
value, then the points at where adjacent line segments on the transfer function meet
will be rounded by the amount given. Typical values for the transfer function are
6:-70,-60,-20.

The third (optional) parameter is an additional gain in dB to be applied at all points on
the transfer function and allows easy adjustment of the overall gain.

The fourth (optional) parameter is an initial level to be assumed for each channel when
companding starts. This permits the user to supply a nominal level initially, so that,
for example, a very large gain is not applied to initial signal levels before the
companding action has begun to operate: it is quite probable that in such an event,
the output would be severely clipped while the compander gain properly adjusts itself.
A typical value (for audio which is initially quiet) is -90 dB.

The fifth (optional) parameter is a delay in seconds. The input signal is analysed
immediately to control the compander, but it is delayed before being fed to the volume
adjuster. Specifying a delay approximately equal to the attack/decay times allows the
compander to effectively operate in a 'predictive' rather than a reactive mode. A
typical value is 0.2 seconds.

The following example might be used to make a piece of music with both quiet and loud
passages suitable for listening to in a noisy environment such as a moving vehicle:

sox asz.wav asz-car.wav compand 0.3,1 6:-70,-60,-20 -5 -90 0.2

The transfer function ('6:-70,...') says that very soft sounds (below -70 dB) will
remain unchanged. This will stop the compander from boosting the volume on 'silent'
passages such as between movements. However, sounds in the range -60 dB to 0 dB (maximum
volume) will be boosted so that the 60 dB dynamic range of the original music will be
compressed 3-to-1 into a 20 dB range, which is wide enough to enjoy the music but narrow
enough to get around the road noise. The '6:' selects 6 dB soft-knee companding. The
-5 (dB) output gain is needed to avoid clipping (the number is inexact, and was derived
by experimentation). The -90 (dB) for the initial volume will work fine for a clip that
starts with near silence, and the delay of 0.2 (seconds) has the effect of causing the
compander to react a bit more quickly to sudden volume changes.

In the next example, compand is being used as a noise-gate for when the noise is at a
lower level than the signal:

play infile compand .1,.2 -inf,-50.1,-inf,-50,-50 0 -90 .1

Here is another noise-gate, this time for when the noise is at a higher level than the
signal (making it, in some ways, similar to squelch):

play infile compand .1,.1 -45.1,-45,-inf,0,-inf 45 -90 .1

See also mcompand for a multiple-band companding effect.
"""
    return help_text
|
185
|
+
|
186
|
+
|
187
|
+
def contrast() -> str:
    """Return the usage line and description for the ``contrast`` effect."""
    help_text = """
contrast [enhancement-amount(75)]
Comparable with compression, this effect modifies an audio signal to make it sound louder.
enhancement-amount controls the amount of the enhancement and is a number in the range 0-100.
Note that enhancement-amount = 0 still gives a significant contrast enhancement.

See also the compand and mcompand effects.
"""
    return help_text
|
196
|
+
|
197
|
+
|
198
|
+
def dcshift() -> str:
    """Return the usage line, description and example for the ``dcshift`` effect."""
    help_text = """
dcshift shift [limitergain]
Apply a DC shift to the audio. This can be useful to remove a DC offset (caused perhaps by
a hardware problem in the recording chain) from the audio. The effect of a DC offset is
reduced headroom and hence volume. The stat or stats effect can be used to determine if a
signal has a DC offset.

The given dcshift value is a floating point number in the range of +/-2 that indicates the
amount to shift the audio (which is in the range of +/-1).

An optional limitergain can be specified as well. It should have a value much less than 1
(e.g. 0.05 or 0.02) and is used only on peaks to prevent clipping.

An alternative approach to removing a DC offset (albeit with a short delay) is to use the
highpass filter effect at a frequency of say 10 Hz, as illustrated in the following example:

sox -n dc.wav synth 5 sin %0 50
sox dc.wav fixed.wav highpass 10
"""
    return help_text
|
218
|
+
|
219
|
+
|
220
|
+
def equalizer() -> str:
    """Return the usage line and description for the ``equalizer`` effect."""
    help_text = """
equalizer frequency[k] width[q|o|h|k] gain
Apply a two-pole peaking equalisation (EQ) filter. With this filter, the signal-level at and
around a selected frequency can be increased or decreased, whilst (unlike band-pass and
band-reject filters) that at all other frequencies is unchanged.

frequency gives the filter's central frequency in Hz, width, the band-width, and gain the
required gain or attenuation in dB. Beware of Clipping when using a positive gain.

In order to produce complex equalisation curves, this effect can be given several times, each
with a different central frequency.

See also bass and treble for shelving equalisation effects.
"""
    return help_text
|
235
|
+
|
236
|
+
|
237
|
+
def flanger() -> str:
    """Return the usage line and description for the ``flanger`` effect."""
    help_text = """
flanger [delay depth regen width speed shape phase interp]
Apply a flanging effect to the audio.

All parameters are optional (right to left).
"""
    return help_text
|
244
|
+
|
245
|
+
|
246
|
+
def gain() -> str:
    """Return the usage line, description and examples for the ``gain`` effect."""
    help_text = """
gain [-e|-B|-b|-r] [-n] [-l|-h] [gain-dB]
Apply amplification or attenuation to the audio signal, or, in some cases, to some of its
channels. Note that use of any of -e, -B, -b, -r, or -n requires temporary file space to store
the audio to be processed, so may be unsuitable for use with 'streamed' audio.

Without other options, gain-dB is used to adjust the signal power level by the given number of
dB: positive amplifies (beware of Clipping), negative attenuates. With other options, the
gain-dB amplification or attenuation is (logically) applied after the processing due to those
options.

Given the -e option, the levels of the audio channels of a multi-channel file are 'equalised',
i.e. gain is applied to all channels other than that with the highest peak level, such that all
channels attain the same peak level (but, without also giving -n, the audio is not 'normalised').

The -B (balance) option is similar to -e, but with -B, the RMS level is used instead of the peak
level. -B might be used to correct stereo imbalance caused by an imperfect record turntable
cartridge. Note that unlike -e, -B might cause some clipping.

-b is similar to -B but has clipping protection, i.e. if necessary to prevent clipping whilst
balancing, attenuation is applied to all channels. Note, however, that in conjunction with
-n, -B and -b are synonymous.

The -r option is used in conjunction with a prior invocation of gain with the -h option - see below
for details.

The -n option normalises the audio to 0 dB FSD; it is often used in conjunction with a negative
gain-dB to the effect that the audio is normalised to a given level below 0 dB. For example,

sox infile outfile gain -n

normalises to 0 dB, and

sox infile outfile gain -n -3

normalises to -3 dB.

The -l option invokes a simple limiter, e.g.

sox infile outfile gain -l 6

will apply 6 dB of gain but never clip. Note that limiting more than a few dBs more than
occasionally (in a piece of audio) is not recommended as it can cause audible distortion. See
the compand effect for a more capable limiter.

The -h option is used to apply gain to provide head-room for subsequent processing.
For example, with

sox infile outfile gain -h bass +6

6 dB of attenuation will be applied prior to the bass boosting effect thus ensuring that it will
not clip. Of course, with bass, it is obvious how much headroom will be needed, but with other
effects (e.g. rate, dither) it is not always as clear. Another advantage of using gain -h rather
than an explicit attenuation, is that if the headroom is not used by subsequent effects, it can
be reclaimed with gain -r, for example:

sox infile outfile gain -h bass +6 rate 44100 gain -r

The above effects chain guarantees never to clip nor amplify; it attenuates if necessary to
prevent clipping, but by only as much as is needed to do so.

Output formatting (dithering and bit-depth reduction) also requires headroom (which cannot be
'reclaimed'), e.g.

sox infile outfile gain -h bass +6 rate 44100 gain -rh dither

Here, the second gain invocation, reclaims as much of the headroom as it can from the preceding
effects, but retains as much headroom as is needed for subsequent processing.

See also the norm and vol effects.
"""
    return help_text
|
318
|
+
|
319
|
+
|
320
|
+
def highpass() -> str:
    """Return the usage line and description for the ``highpass`` effect."""
    help_text = """
highpass [-1|-2] frequency[k] [width[q|o|h|k]]
Apply a high-pass filter with 3 dB point frequency. The filter can be either single-pole (with -1),
or double-pole (the default, or with -2). width applies only to double-pole filters; the default
is Q = 0.707 and gives a Butterworth response. The filters roll off at 6 dB per pole per octave
(20 dB per pole per decade).

See also sinc for filters with a steeper roll-off.
"""
    return help_text
|
330
|
+
|
331
|
+
|
332
|
+
def hilbert() -> str:
    """Return the usage line and description for the ``hilbert`` effect."""
    help_text = """
hilbert [-n taps]
Apply an odd-tap Hilbert transform filter, phase-shifting the signal by 90 degrees.

This is used in many matrix coding schemes and for analytic signal generation. The process is
often written as a multiplication by i (or j), the imaginary unit.

An odd-tap Hilbert transform filter has a bandpass characteristic, attenuating the lowest and
highest frequencies. Its bandwidth can be controlled by the number of filter taps, which can be
specified with -n. By default, the number of taps is chosen for a cutoff frequency of about 75 Hz.
"""
    return help_text
|
344
|
+
|
345
|
+
|
346
|
+
def loudness() -> str:
    """Return the usage line and description for the ``loudness`` effect."""
    help_text = """
loudness [gain [reference]]
Loudness control - similar to the gain effect, but provides equalisation for the human auditory
system. See https://en.wikipedia.org/wiki/Loudness for a detailed description of loudness. The
gain is adjusted by the given gain parameter (usually negative) and the signal equalised
according to ISO 226 w.r.t. a reference level of 65 dB, though an alternative reference level
may be given if the original audio has been equalised for some other optimal level. A default
gain of -10 dB is used if a gain value is not given.

See also the gain effect.
"""
    return help_text
|
358
|
+
|
359
|
+
|
360
|
+
def lowpass() -> str:
    """Return the usage line and description for the ``lowpass`` effect."""
    help_text = """
lowpass [-1|-2] frequency[k] [width[q|o|h|k]]
Apply a low-pass filter with 3 dB point frequency. The filter can be either single-pole (with -1),
or double-pole (the default, or with -2). width applies only to double-pole filters; the default
is Q = 0.707 and gives a Butterworth response. The filters roll off at 6 dB per pole per octave
(20 dB per pole per decade).

See also sinc for filters with a steeper roll-off.
"""
    return help_text
|
370
|
+
|
371
|
+
|
372
|
+
def mcompand() -> str:
    """Return the usage line, description and example for the ``mcompand`` effect.

    The multi-line example command ends each line with a shell continuation
    backslash. Those backslashes are written as ``\\\\`` in the literal below:
    in a non-raw string a bare backslash followed by a newline is a line
    continuation, which would silently drop both the backslash and the line
    break from the returned text.
    """
    help_text = """
mcompand "attack1,decay1{,attack2,decay2} [soft-knee-dB:]in-dB1[,out-dB1]{,in-dB2,out-dB2} [gain [initial-volume-dB [delay]]]" {crossover-freq[k] "attack1,..."}
The multi-band compander is similar to the single-band compander but the audio is first divided
into bands using Linkwitz-Riley cross-over filters and a separately specifiable compander run on
each band. See the compand effect for the definition of its parameters. Compand parameters are
specified between double quotes and the crossover frequency for that band is given by
crossover-freq; these can be repeated to create multiple bands.

For example, the following (one long) command shows how multi-band companding is typically used
in FM radio:

play track1.wav gain -3 sinc 8000- 29 100 mcompand \\
"0.005,0.1 -47,-40,-34,-34,-17,-33" 100 \\
"0.003,0.05 -47,-40,-34,-34,-17,-33" 400 \\
"0.000625,0.0125 -47,-40,-34,-34,-15,-33" 1600 \\
"0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30" 6400 \\
"0,0.025 -38,-31,-28,-28,-0,-25" \\
gain 15 highpass 22 highpass 22 sinc -n 255 -b 16 -17500 \\
gain 9 lowpass -1 17801

The audio file is played with a simulated FM radio sound (or broadcast signal condition if the
lowpass filter at the end is skipped). Note that the pipeline is set up with US-style 75 us
pre-emphasis.

See also compand for a single-band companding effect.
"""
    return help_text
|
399
|
+
|
400
|
+
|
401
|
+
def norm() -> str:
    """Return the usage line and description for the ``norm`` effect."""
    help_text = """
norm [dB-level]
Normalise the audio. norm is just an alias for gain -n; see the gain effect for details.
"""
    return help_text
|
406
|
+
|
407
|
+
|
408
|
+
def overdrive() -> str:
    """Return the usage line and description for the ``overdrive`` effect."""
    help_text = """
overdrive [gain(20) [colour(20)]]
Non linear distortion. The colour parameter controls the amount of even harmonic content in the
over-driven output.
"""
    return help_text
|
414
|
+
|
415
|
+
|
416
|
+
def phaser() -> str:
    """Return the usage line, description and examples for the ``phaser`` effect."""
    help_text = """
phaser gain-in gain-out delay decay speed [-s|-t]
Add a phasing effect to the audio.

delay/decay/speed gives the delay in milliseconds and the decay (relative to gain-in) with a
modulation speed in Hz. The modulation is either sinusoidal (-s) - preferable for multiple
instruments, or triangular (-t) - gives single instruments a sharper phasing effect. The decay
should be less than 0.5 to avoid feedback, and usually no less than 0.1. Gain-out is the
volume of the output.

For example:

play snare.flac phaser 0.8 0.74 3 0.4 0.5 -t

Gentler:

play snare.flac phaser 0.9 0.85 4 0.23 1.3 -s

A popular sound:

play snare.flac phaser 0.89 0.85 1 0.24 2 -t

More severe:

play snare.flac phaser 0.6 0.66 3 0.6 2 -t
"""
    return help_text
|
443
|
+
|
444
|
+
|
445
|
+
def pitch() -> str:
    """Return the usage line and description for the ``pitch`` effect."""
    help_text = """
pitch [-q] shift [segment [search [overlap]]]
Change the audio pitch (but not tempo).

shift gives the pitch shift as positive or negative 'cents' (i.e. 100ths of a semitone). See
the tempo effect for a description of the other parameters.

See also the bend, speed, and tempo effects.
"""
    return help_text
|
455
|
+
|
456
|
+
|
457
|
+
def reverb() -> str:
    """Return the usage line, description and examples for the ``reverb`` effect.

    The pointer to background reading is given as a direct URL because the
    returned text carries no reference list that a numbered citation such as
    "[3]" could resolve against.
    """
    help_text = """
reverb [-w|--wet-only] [reverberance (50%) [HF-damping (50%) [room-scale (100%) [stereo-depth (100%) [pre-delay (0 ms) [wet-gain (0 dB)]]]]]]
Add reverberation to the audio using the 'freeverb' algorithm. A reverberation effect is
sometimes desirable for concert halls that are too small or contain so many people that the
hall's natural reverberance is diminished. Applying a small amount of stereo reverb to a (dry)
mono signal will usually make it sound more natural. See
https://en.wikipedia.org/wiki/Reverberation for a detailed description of reverberation.

Note that this effect increases both the volume and the length of the audio, so to prevent
clipping in these domains, a typical invocation might be:

play dry.wav gain -3 pad 0 3 reverb

The -w option can be given to select only the 'wet' signal, thus allowing it to be processed
further, independently of the 'dry' signal. E.g.

play -m voice.wav "|sox voice.wav -p reverse reverb -w reverse"

for a reverse reverb effect.
"""
    return help_text
|
478
|
+
|
479
|
+
|
480
|
+
def sinc() -> str:
    """Return the usage line, description and examples for the ``sinc`` effect."""
    help_text = """
sinc [-a att|-b beta] [-p phase|-M|-I|-L] [-t tbw|-n taps] [freqHP] [-freqLP [-t tbw|-n taps]]
Apply a sinc kaiser-windowed low-pass, high-pass, band-pass, or band-reject filter to the
signal. The freqHP and freqLP parameters give the frequencies of the 6 dB points of a
high-pass and low-pass filter that may be invoked individually, or together. If both are
given, then freqHP less than freqLP creates a band-pass filter, freqHP greater than freqLP
creates a band-reject filter. For example, the invocations

sinc 3k
sinc -4k
sinc 3k-4k
sinc 4k-3k

create a high-pass, low-pass, band-pass, and band-reject filter respectively.

The default stop-band attenuation of 120 dB can be overridden with -a; alternatively, the
kaiser-window 'beta' parameter can be given directly with -b.

The default transition band-width of 5% of the total band can be overridden with -t (and
tbw in Hertz); alternatively, the number of filter taps can be given directly with -n.

If both freqHP and freqLP are given, then a -t or -n option given to the left of the
frequencies applies to both frequencies; one of these options given to the right of the
frequencies applies only to freqLP.

The -p, -M, -I, and -L options control the filter's phase response; see the rate effect
for details.
"""
    return help_text
|
509
|
+
|
510
|
+
|
511
|
+
def speed() -> str:
    """Return the usage line and description for the ``speed`` effect."""
    help_text = """
speed factor[c]
Adjust the audio speed (pitch and tempo together). factor is either the ratio of the new
speed to the old speed: greater than 1 speeds up, less than 1 slows down, or, if appended
with the letter 'c', the number of cents (i.e. 100ths of a semitone) by which the pitch
(and tempo) should be adjusted: greater than 0 increases, less than 0 decreases.

Technically, the speed effect only changes the sample rate information, leaving the samples
themselves untouched. The rate effect is invoked automatically to resample to the output
sample rate, using its default quality/speed. For higher quality or higher speed
resampling, in addition to the speed effect, specify the rate effect with the desired
quality option.

See also the bend, pitch, and tempo effects.
"""
    return help_text
|
527
|
+
|
528
|
+
|
529
|
+
def tempo() -> str:
|
530
|
+
return """
|
531
|
+
tempo [-q] [-m|-s|-l] factor [segment [search [overlap]]]
|
532
|
+
Change the audio playback speed but not its pitch. This effect uses the WSOLA algorithm. The
|
533
|
+
audio is chopped up into segments which are then shifted in the time domain and overlapped
|
534
|
+
(cross-faded) at points where their waveforms are most similar as determined by measurement
|
535
|
+
of 'least squares'.
|
536
|
+
|
537
|
+
By default, linear searches are used to find the best overlapping points. If the optional -q
|
538
|
+
parameter is given, tree searches are used instead. This makes the effect work more quickly,
|
539
|
+
but the result may not sound as good. However, if you must improve the processing speed,
|
540
|
+
this generally reduces the sound quality less than reducing the search or overlap values.
|
541
|
+
|
542
|
+
The -m option is used to optimize default values of segment, search and overlap for music
|
543
|
+
processing.
|
544
|
+
|
545
|
+
The -s option is used to optimize default values of segment, search and overlap for speech
|
546
|
+
processing.
|
547
|
+
|
548
|
+
The -l option is used to optimize default values of segment, search and overlap for 'linear'
|
549
|
+
processing that tends to cause more noticeable distortion but may be useful when factor is
|
550
|
+
close to 1.
|
551
|
+
|
552
|
+
If -m, -s, or -l is specified, the default value of segment will be calculated based on
|
553
|
+
factor, while default search and overlap values are based on segment. Any values you provide
|
554
|
+
still override these default values.
|
555
|
+
|
556
|
+
factor gives the ratio of new tempo to the old tempo, so e.g. 1.1 speeds up the tempo by
|
557
|
+
10%, and 0.9 slows it down by 10%.
|
558
|
+
|
559
|
+
The optional segment parameter selects the algorithm's segment size in milliseconds. If no
|
560
|
+
other flags are specified, the default value is 82 and is typically suited to making small
|
561
|
+
changes to the tempo of music. For larger changes (e.g. a factor of 2), 41 ms may give a
|
562
|
+
better result. The -m, -s, and -l flags will cause the segment default to be automatically
|
563
|
+
adjusted based on factor. For example using -s (for speech) with a tempo of 1.25 will
|
564
|
+
calculate a default segment value of 32.
|
565
|
+
|
566
|
+
The optional search parameter gives the audio length in milliseconds over which the algorithm
|
567
|
+
will search for overlapping points. If no other flags are specified, the default value is
|
568
|
+
14.68. Larger values use more processing time and may or may not produce better results. A
|
569
|
+
practical maximum is half the value of segment. Search can be reduced to cut processing time
|
570
|
+
at the risk of degrading output quality. The -m, -s, and -l flags will cause the search
|
571
|
+
default to be automatically adjusted based on segment.
|
572
|
+
|
573
|
+
The optional overlap parameter gives the segment overlap length in milliseconds. Default
|
574
|
+
value is 12, but -m, -s, or -l flags automatically adjust overlap based on segment size.
|
575
|
+
Increasing overlap increases processing time and may increase quality. A practical maximum
|
576
|
+
for overlap is the value of search, with overlap typically being (at least) a little
|
577
|
+
smaller than search.
|
578
|
+
|
579
|
+
See also speed for an effect that changes tempo and pitch together, pitch and bend for
|
580
|
+
effects that change pitch only, and stretch for an effect that changes tempo using a
|
581
|
+
different algorithm.
|
582
|
+
"""
|
583
|
+
|
584
|
+
|
585
|
+
def treble() -> str:
|
586
|
+
return """
|
587
|
+
treble gain [frequency[k] [width[s|h|k|o|q]]]
|
588
|
+
Boost or cut the treble (upper) frequencies of the audio using a two-pole shelving
|
589
|
+
filter with a response similar to that of a standard hi-fi's tone-controls. This is
|
590
|
+
also known as shelving equalisation (EQ).
|
591
|
+
|
592
|
+
gain gives the gain at whichever is the lower of ~22 kHz and the Nyquist frequency.
|
593
|
+
Its useful range is about -20 (for a large cut) to +20 (for a large boost). Beware
|
594
|
+
of Clipping when using a positive gain.
|
595
|
+
|
596
|
+
If desired, the filter can be fine-tuned using the following optional parameters:
|
597
|
+
|
598
|
+
frequency sets the filter's central frequency and so can be used to extend or reduce
|
599
|
+
the frequency range to be boosted or cut. The default value is 3 kHz.
|
600
|
+
|
601
|
+
width determines how steep the filter's shelf transition is. In addition to the
|
602
|
+
common width specification methods described above, 'slope' (the default, or if
|
603
|
+
appended with 's') may be used. The useful range of 'slope' is about 0.3, for a
|
604
|
+
gentle slope, to 1 (the maximum), for a steep slope; the default value is 0.5.
|
605
|
+
|
606
|
+
See also equalizer for a peaking equalisation effect.
|
607
|
+
"""
|
608
|
+
|
609
|
+
|
610
|
+
def tremolo() -> str:
|
611
|
+
return """
|
612
|
+
tremolo speed [depth]
|
613
|
+
Apply a tremolo (low frequency amplitude modulation) effect to the audio. The
|
614
|
+
tremolo frequency in Hz is given by speed, and the depth as a percentage by
|
615
|
+
depth (default 40).
|
616
|
+
"""
|
617
|
+
|
618
|
+
|
619
|
+
def vol() -> str:
|
620
|
+
return """
|
621
|
+
vol gain [type [limitergain]]
|
622
|
+
Apply an amplification or an attenuation to the audio signal. Unlike the -v option
|
623
|
+
(which is used for balancing multiple input files as they enter the sox effects
|
624
|
+
processing chain), vol is an effect like any other so can be applied anywhere, and
|
625
|
+
several times if necessary, during the processing chain.
|
626
|
+
|
627
|
+
The amount to change the volume is given by gain which is interpreted, according
|
628
|
+
to the given type, as follows: if type is amplitude (or is omitted), then gain is
|
629
|
+
an amplitude (i.e. voltage or linear) ratio, if power, then a power (i.e. wattage
|
630
|
+
or voltage-squared) ratio, and if dB, then a power change in dB.
|
631
|
+
|
632
|
+
When type is amplitude or power, a gain of 1 leaves the volume unchanged, less
|
633
|
+
than 1 decreases it, and greater than 1 increases it; a negative gain inverts the
|
634
|
+
audio signal in addition to adjusting its volume.
|
635
|
+
|
636
|
+
When type is dB, a gain of 0 leaves the volume unchanged, less than 0 decreases
|
637
|
+
it, and greater than 0 increases it.
|
638
|
+
|
639
|
+
Beware of Clipping when increasing the volume.
|
640
|
+
|
641
|
+
The gain and the type parameters can be concatenated if desired, e.g. vol 10dB.
|
642
|
+
|
643
|
+
An optional limitergain value can be specified and should be a value much less
|
644
|
+
than 1 (e.g. 0.05 or 0.02) and is used only on peaks to prevent clipping. Not
|
645
|
+
specifying this parameter will cause no limiter to be used. In verbose mode,
|
646
|
+
this effect will display the percentage of the audio that needed to be limited.
|
647
|
+
|
648
|
+
See also gain for a volume-changing effect with different capabilities, and
|
649
|
+
compand for a dynamic-range compression/expansion/limiting effect.
|
650
|
+
"""
|
sonusai/mixture/spectral_mask.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
from
|
2
|
-
from
|
1
|
+
from ..datatypes import AudioF
|
2
|
+
from ..datatypes import SpectralMask
|
3
3
|
|
4
4
|
|
5
5
|
def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int | None = None) -> AudioF:
|