sonusai 0.20.3__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sonusai/__init__.py +16 -3
  2. sonusai/audiofe.py +241 -77
  3. sonusai/calc_metric_spenh.py +71 -73
  4. sonusai/config/__init__.py +3 -0
  5. sonusai/config/config.py +61 -0
  6. sonusai/config/config.yml +20 -0
  7. sonusai/config/constants.py +8 -0
  8. sonusai/constants.py +11 -0
  9. sonusai/data/genmixdb.yml +21 -36
  10. sonusai/{mixture/datatypes.py → datatypes.py} +91 -130
  11. sonusai/deprecated/plot.py +4 -5
  12. sonusai/doc/doc.py +4 -4
  13. sonusai/doc.py +11 -4
  14. sonusai/genft.py +43 -45
  15. sonusai/genmetrics.py +25 -19
  16. sonusai/genmix.py +54 -82
  17. sonusai/genmixdb.py +88 -264
  18. sonusai/ir_metric.py +30 -34
  19. sonusai/lsdb.py +41 -48
  20. sonusai/main.py +15 -22
  21. sonusai/metrics/calc_audio_stats.py +4 -293
  22. sonusai/metrics/calc_class_weights.py +4 -4
  23. sonusai/metrics/calc_optimal_thresholds.py +8 -5
  24. sonusai/metrics/calc_pesq.py +2 -2
  25. sonusai/metrics/calc_segsnr_f.py +4 -4
  26. sonusai/metrics/calc_speech.py +25 -13
  27. sonusai/metrics/class_summary.py +7 -7
  28. sonusai/metrics/confusion_matrix_summary.py +5 -5
  29. sonusai/metrics/one_hot.py +4 -4
  30. sonusai/metrics/snr_summary.py +7 -7
  31. sonusai/metrics_summary.py +38 -45
  32. sonusai/mixture/__init__.py +4 -104
  33. sonusai/mixture/audio.py +10 -39
  34. sonusai/mixture/class_balancing.py +103 -0
  35. sonusai/mixture/config.py +251 -271
  36. sonusai/mixture/constants.py +35 -39
  37. sonusai/mixture/data_io.py +25 -36
  38. sonusai/mixture/db_datatypes.py +58 -22
  39. sonusai/mixture/effects.py +386 -0
  40. sonusai/mixture/feature.py +7 -11
  41. sonusai/mixture/generation.py +478 -628
  42. sonusai/mixture/helpers.py +82 -184
  43. sonusai/mixture/ir_delay.py +3 -4
  44. sonusai/mixture/ir_effects.py +77 -0
  45. sonusai/mixture/log_duration_and_sizes.py +6 -12
  46. sonusai/mixture/mixdb.py +910 -729
  47. sonusai/mixture/pad_audio.py +35 -0
  48. sonusai/mixture/resample.py +7 -0
  49. sonusai/mixture/sox_effects.py +195 -0
  50. sonusai/mixture/sox_help.py +650 -0
  51. sonusai/mixture/spectral_mask.py +2 -2
  52. sonusai/mixture/truth.py +17 -15
  53. sonusai/mixture/truth_functions/crm.py +12 -12
  54. sonusai/mixture/truth_functions/energy.py +22 -22
  55. sonusai/mixture/truth_functions/file.py +5 -5
  56. sonusai/mixture/truth_functions/metadata.py +4 -4
  57. sonusai/mixture/truth_functions/metrics.py +4 -4
  58. sonusai/mixture/truth_functions/phoneme.py +3 -3
  59. sonusai/mixture/truth_functions/sed.py +11 -13
  60. sonusai/mixture/truth_functions/target.py +10 -10
  61. sonusai/mkwav.py +26 -29
  62. sonusai/onnx_predict.py +240 -88
  63. sonusai/queries/__init__.py +2 -2
  64. sonusai/queries/queries.py +38 -34
  65. sonusai/speech/librispeech.py +1 -1
  66. sonusai/speech/mcgill.py +1 -1
  67. sonusai/speech/timit.py +2 -2
  68. sonusai/summarize_metric_spenh.py +10 -17
  69. sonusai/utils/__init__.py +7 -1
  70. sonusai/utils/asl_p56.py +2 -2
  71. sonusai/utils/asr.py +2 -2
  72. sonusai/utils/asr_functions/aaware_whisper.py +4 -5
  73. sonusai/utils/choice.py +31 -0
  74. sonusai/utils/compress.py +1 -1
  75. sonusai/utils/dataclass_from_dict.py +19 -1
  76. sonusai/utils/energy_f.py +3 -3
  77. sonusai/utils/evaluate_random_rule.py +15 -0
  78. sonusai/utils/keyboard_interrupt.py +12 -0
  79. sonusai/utils/onnx_utils.py +3 -17
  80. sonusai/utils/print_mixture_details.py +21 -19
  81. sonusai/utils/{temp_seed.py → rand.py} +3 -3
  82. sonusai/utils/read_predict_data.py +2 -2
  83. sonusai/utils/reshape.py +3 -3
  84. sonusai/utils/stratified_shuffle_split.py +3 -3
  85. sonusai/{mixture → utils}/tokenized_shell_vars.py +1 -1
  86. sonusai/utils/write_audio.py +2 -2
  87. sonusai/vars.py +11 -4
  88. {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/METADATA +4 -2
  89. sonusai-1.0.2.dist-info/RECORD +138 -0
  90. sonusai/mixture/augmentation.py +0 -444
  91. sonusai/mixture/class_count.py +0 -15
  92. sonusai/mixture/eq_rule_is_valid.py +0 -45
  93. sonusai/mixture/target_class_balancing.py +0 -107
  94. sonusai/mixture/targets.py +0 -175
  95. sonusai-0.20.3.dist-info/RECORD +0 -128
  96. {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/WHEEL +0 -0
  97. {sonusai-0.20.3.dist-info → sonusai-1.0.2.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,650 @@
1
+ def allpass() -> str:
2
+ return """
3
+ allpass frequency[k] width[h|k|o|q]
4
+ Apply a two-pole all-pass filter with central frequency (in Hz) frequency,
5
+ and filter-width width. An all-pass filter changes the audio's frequency to
6
+ phase relationship without changing its frequency to amplitude relationship.
7
+ """
8
+
9
+
10
+ def band() -> str:
11
+ return """
12
+ band [-n] center[k] [width[h|k|o|q]]
13
+ Apply a band-pass filter. The frequency response drops logarithmically around
14
+ the center frequency. The width parameter gives the slope of the drop. The
15
+ frequencies at center + width and center - width will be half of their original
16
+ amplitudes. band defaults to a mode oriented to pitched audio, i.e. voice,
17
+ singing, or instrumental music. The -n (for noise) option uses the alternate
18
+ mode for un-pitched audio (e.g. percussion). Warning: -n introduces a power-gain
19
+ of about 11 dB in the filter, so beware of output clipping. band introduces noise
20
+ in the shape of the filter, i.e. peaking at the center frequency and settling
21
+ around it.
22
+
23
+ See also sinc for a bandpass filter with steeper shoulders.
24
+ """
25
+
26
+
27
+ def bandpass() -> str:
28
+ return """
29
+ bandpass [-c] frequency[k] width[h|k|o|q]
30
+ Apply a two-pole Butterworth band-pass filter with central frequency frequency,
31
+ and (3 dB-point) band-width width. The -c option applies only to bandpass and
32
+ selects a constant skirt gain (peak gain = Q) instead of the default: constant
33
+ 0 dB peak gain. The filters roll off at 6 dB per octave (20 dB per decade).
34
+
35
+ See also sinc for a bandpass filter with steeper shoulders.
36
+ """
37
+
38
+
39
+ def bandreject() -> str:
40
+ return """
41
+ bandreject frequency[k] width[h|k|o|q]
42
+ Apply a two-pole Butterworth band-reject filter with central frequency frequency,
43
+ and (3 dB-point) band-width width. The -c option applies only to bandpass and
44
+ selects a constant skirt gain (peak gain = Q) instead of the default: constant
45
+ 0 dB peak gain. The filters roll off at 6 dB per octave (20 dB per decade).
46
+
47
+ See also sinc for a bandpass filter with steeper shoulders.
48
+ """
49
+
50
+
51
+ def bass() -> str:
52
+ return """
53
+ bass gain [frequency[k] [width[s|h|k|o|q]]]
54
+ Boost or cut the bass (lower) frequencies of the audio using a two-pole shelving
55
+ filter with a response similar to that of a standard hi-fi's tone-controls. This
56
+ is also known as shelving equalisation (EQ).
57
+
58
+ gain gives the gain at 0 Hz. Its useful range is about -20 (for a large cut) to
59
+ +20 (for a large boost). Beware of Clipping when using a positive gain.
60
+
61
+ If desired, the filter can be fine-tuned using the following optional parameters:
62
+
63
+ frequency sets the filter's central frequency and so can be used to extend or reduce
64
+ the frequency range to be boosted or cut. The default value is 100 Hz.
65
+
66
+ width determines how steep is the filter's shelf transition. In addition to the
67
+ common width specification methods described above, 'slope' (the default, or if
68
+ appended with 's') may be used. The useful range of 'slope' is about 0.3, for a
69
+ gentle slope, to 1 (the maximum), for a steep slope; the default value is 0.5.
70
+
71
+ See also equalizer for a peaking equalisation effect.
72
+ """
73
+
74
+
75
+ def biquad() -> str:
76
+ return """
77
+ biquad b0 b1 b2 a0 a1 a2
78
+ Apply a biquad IIR filter with the given coefficients. Where b* and a* are the numerator
79
+ and denominator coefficients respectively.
80
+
81
+ See https://en.wikipedia.org/wiki/Digital_biquad_filter (where a0 = 1).
82
+ """
83
+
84
+
85
+ def chorus() -> str:
86
+ return """
87
+ chorus gain-in gain-out <delay decay speed depth -s|-t>
88
+ Add a chorus effect to the audio. This can make a single vocal sound like a chorus,
89
+ but can also be applied to instrumentation.
90
+
91
+ Chorus resembles an echo effect with a short delay, but whereas with echo the delay
92
+ is constant, with chorus, it is varied using sinusoidal or triangular modulation. The
93
+ modulation depth defines the range the modulated delay is played before or after the
94
+ delay. Hence the delayed sound will sound slower or faster, that is the delayed sound
95
+ tuned around the original one, like in a chorus where some vocals are slightly off key.
96
+
97
+ Each four-tuple parameter delay/decay/speed/depth gives the delay in milliseconds and
98
+ the decay (relative to gain-in) with a modulation speed in Hz using depth in
99
+ milliseconds. The modulation is either sinusoidal (-s) or triangular (-t). Gain-out is
100
+ the volume of the output.
101
+
102
+ A typical delay is around 40 ms to 60 ms; the modulation speed is best near 0.25 Hz and
103
+ the modulation depth around 2 ms. For example, a single delay:
104
+
105
+ play guitar1.wav chorus 0.7 0.9 55 0.4 0.25 2 -t
106
+
107
+ Two delays of the original samples:
108
+
109
+ play guitar1.wav chorus 0.6 0.9 50 0.4 0.25 2 -t 60 0.32 0.4 1.3 -s
110
+
111
+ A fuller sounding chorus (with three additional delays):
112
+
113
+ play guitar1.wav chorus 0.5 0.9 50 0.4 0.25 2 -t 60 0.32 0.4 2.3 -t 40 0.3 0.3 1.3 -s
114
+ """
115
+
116
+
117
+ def compand() -> str:
118
+ return """
119
+ compand attack1,decay1{,attack2,decay2} [soft-knee-dB:]in-dB1[,out-dB1]{,in-dB2,out-dB2} [gain [initial-volume-dB [delay]]]
120
+ Compand (compress or expand) the dynamic range of the audio.
121
+
122
+ The attack and decay parameters (in seconds) determine the time over which the
123
+ instantaneous level of the input signal is averaged to determine its volume; attacks
124
+ refer to increases in volume and decays refer to decreases. For most situations, the
125
+ attack time (response to the music getting louder) should be shorter than the decay
126
+ time because the human ear is more sensitive to sudden loud music than sudden soft
127
+ music. Where more than one pair of attack/decay parameters are specified, each input
128
+ channel is companded separately and the number of pairs must agree with the number of
129
+ input channels. Typical values are 0.3,0.8 seconds.
130
+
131
+ The second parameter is a list of points on the compander's transfer function specified
132
+ in dB relative to the maximum possible signal amplitude. The input values must be in a
133
+ strictly increasing order but the transfer function does not have to be monotonically
134
+ rising. If omitted, the value of out-dB1 defaults to the same value as in-dB1; levels
135
+ below in-dB1 are not companded (but may have gain applied to them). The point 0,0 is
136
+ assumed but may be overridden (by 0,out-dBn). If the list is preceded by a soft-knee-dB
137
+ value, then the points where adjacent line segments on the transfer function meet
138
+ will be rounded by the amount given. Typical values for the transfer function are
139
+ 6:-70,-60,-20.
140
+
141
+ The third (optional) parameter is an additional gain in dB to be applied at all points on
142
+ the transfer function and allows easy adjustment of the overall gain.
143
+
144
+ The fourth (optional) parameter is an initial level to be assumed for each channel when
145
+ companding starts. This permits the user to supply a nominal level initially, so that,
146
+ for example, a very large gain is not applied to initial signal levels before the
147
+ companding action has begun to operate: it is quite probable that in such an event,
148
+ the output would be severely clipped while the compander gain properly adjusts itself.
149
+ A typical value (for audio which is initially quiet) is -90 dB.
150
+
151
+ The fifth (optional) parameter is a delay in seconds. The input signal is analysed
152
+ immediately to control the compander, but it is delayed before being fed to the volume
153
+ adjuster. Specifying a delay approximately equal to the attack/decay times allows the
154
+ compander to effectively operate in a 'predictive' rather than a reactive mode. A
155
+ typical value is 0.2 seconds.
156
+
157
+ The following example might be used to make a piece of music with both quiet and loud
158
+ passages suitable for listening to in a noisy environment such as a moving vehicle:
159
+
160
+ sox asz.wav asz-car.wav compand 0.3,1 6:-70,-60,-20 -5 -90 0.2
161
+
162
+ The transfer function ('6:-70,...') says that very soft sounds (below -70 dB) will
163
+ remain unchanged. This will stop the compander from boosting the volume on 'silent'
164
+ passages such as between movements. However, sounds in the range -60 dB to 0 dB (maximum
165
+ volume) will be boosted so that the 60 dB dynamic range of the original music will be
166
+ compressed 3-to-1 into a 20 dB range, which is wide enough to enjoy the music but narrow
167
+ enough to get around the road noise. The '6:' selects 6 dB soft-knee companding. The
168
+ -5 (dB) output gain is needed to avoid clipping (the number is inexact, and was derived
169
+ by experimentation). The -90 (dB) for the initial volume will work fine for a clip that
170
+ starts with near silence, and the delay of 0.2 (seconds) has the effect of causing the
171
+ compander to react a bit more quickly to sudden volume changes.
172
+
173
+ In the next example, compand is being used as a noise-gate for when the noise is at a
174
+ lower level than the signal:
175
+
176
+ play infile compand .1,.2 -inf,-50.1,-inf,-50,-50 0 -90 .1
177
+
178
+ Here is another noise-gate, this time for when the noise is at a higher level than the
179
+ signal (making it, in some ways, similar to squelch):
180
+
181
+ play infile compand .1,.1 -45.1,-45,-inf,0,-inf 45 -90 .1
182
+
183
+ See also mcompand for a multiple-band companding effect.
184
+ """
185
+
186
+
187
+ def contrast() -> str:
188
+ return """
189
+ contrast [enhancement-amount(75)]
190
+ Comparable with compression, this effect modifies an audio signal to make it sound louder.
191
+ enhancement-amount controls the amount of the enhancement and is a number in the range 0-100.
192
+ Note that enhancement-amount = 0 still gives a significant contrast enhancement.
193
+
194
+ See also the compand and mcompand effects.
195
+ """
196
+
197
+
198
+ def dcshift() -> str:
199
+ return """
200
+ dcshift shift [limitergain]
201
+ Apply a DC shift to the audio. This can be useful to remove a DC offset (caused perhaps by
202
+ a hardware problem in the recording chain) from the audio. The effect of a DC offset is
203
+ reduced headroom and hence volume. The stat or stats effect can be used to determine if a
204
+ signal has a DC offset.
205
+
206
+ The given dcshift value is a floating point number in the range of +/-2 that indicates the
207
+ amount to shift the audio (which is in the range of +/-1).
208
+
209
+ An optional limitergain can be specified as well. It should have a value much less than 1
210
+ (e.g. 0.05 or 0.02) and is used only on peaks to prevent clipping.
211
+
212
+ An alternative approach to removing a DC offset (albeit with a short delay) is to use the
213
+ highpass filter effect at a frequency of say 10 Hz, as illustrated in the following example:
214
+
215
+ sox -n dc.wav synth 5 sin %0 50
216
+ sox dc.wav fixed.wav highpass 10
217
+ """
218
+
219
+
220
+ def equalizer() -> str:
221
+ return """
222
+ equalizer frequency[k] width[q|o|h|k] gain
223
+ Apply a two-pole peaking equalisation (EQ) filter. With this filter, the signal-level at and
224
+ around a selected frequency can be increased or decreased, whilst (unlike band-pass and
225
+ band-reject filters) that at all other frequencies is unchanged.
226
+
227
+ frequency gives the filter's central frequency in Hz, width, the band-width, and gain the
228
+ required gain or attenuation in dB. Beware of Clipping when using a positive gain.
229
+
230
+ In order to produce complex equalisation curves, this effect can be given several times, each
231
+ with a different central frequency.
232
+
233
+ See also bass and treble for shelving equalisation effects.
234
+ """
235
+
236
+
237
+ def flanger() -> str:
238
+ return """
239
+ flanger [delay depth regen width speed shape phase interp]
240
+ Apply a flanging effect to the audio.
241
+
242
+ All parameters are optional (right to left).
243
+ """
244
+
245
+
246
+ def gain() -> str:
247
+ return """
248
+ gain [-e|-B|-b|-r] [-n] [-l|-h] [gain-dB]
249
+ Apply amplification or attenuation to the audio signal, or, in some cases, to some of its
250
+ channels. Note that use of any of -e, -B, -b, -r, or -n requires temporary file space to store
251
+ the audio to be processed, so may be unsuitable for use with 'streamed' audio.
252
+
253
+ Without other options, gain-dB is used to adjust the signal power level by the given number of
254
+ dB: positive amplifies (beware of Clipping), negative attenuates. With other options, the
255
+ gain-dB amplification or attenuation is (logically) applied after the processing due to those
256
+ options.
257
+
258
+ Given the -e option, the levels of the audio channels of a multi-channel file are 'equalised',
259
+ i.e. gain is applied to all channels other than that with the highest peak level, such that all
260
+ channels attain the same peak level (but, without also giving -n, the audio is not 'normalised').
261
+
262
+ The -B (balance) option is similar to -e, but with -B, the RMS level is used instead of the peak
263
+ level. -B might be used to correct stereo imbalance caused by an imperfect record turntable
264
+ cartridge. Note that unlike -e, -B might cause some clipping.
265
+
266
+ -b is similar to -B but has clipping protection, i.e. if necessary to prevent clipping whilst
267
+ balancing, attenuation is applied to all channels. Note, however, that in conjunction with
268
+ -n, -B and -b are synonymous.
269
+
270
+ The -r option is used in conjunction with a prior invocation of gain with the -h option - see below
271
+ for details.
272
+
273
+ The -n option normalises the audio to 0 dB FSD; it is often used in conjunction with a negative
274
+ gain-dB to the effect that the audio is normalised to a given level below 0 dB. For example,
275
+
276
+ sox infile outfile gain -n
277
+
278
+ normalises to 0 dB, and
279
+
280
+ sox infile outfile gain -n -3
281
+
282
+ normalises to -3 dB.
283
+
284
+ The -l option invokes a simple limiter, e.g.
285
+
286
+ sox infile outfile gain -l 6
287
+
288
+ will apply 6 dB of gain but never clip. Note that limiting more than a few dBs more than
289
+ occasionally (in a piece of audio) is not recommended as it can cause audible distortion. See
290
+ the compand effect for a more capable limiter.
291
+
292
+ The -h option is used to apply gain to provide head-room for subsequent processing.
293
+ For example, with
294
+
295
+ sox infile outfile gain -h bass +6
296
+
297
+ 6 dB of attenuation will be applied prior to the bass boosting effect thus ensuring that it will
298
+ not clip. Of course, with bass, it is obvious how much headroom will be needed, but with other
299
+ effects (e.g. rate, dither) it is not always as clear. Another advantage of using gain -h rather
300
+ than an explicit attenuation, is that if the headroom is not used by subsequent effects, it can
301
+ be reclaimed with gain -r, for example:
302
+
303
+ sox infile outfile gain -h bass +6 rate 44100 gain -r
304
+
305
+ The above effects chain guarantees never to clip nor amplify; it attenuates if necessary to
306
+ prevent clipping, but by only as much as is needed to do so.
307
+
308
+ Output formatting (dithering and bit-depth reduction) also requires headroom (which cannot be
309
+ 'reclaimed'), e.g.
310
+
311
+ sox infile outfile gain -h bass +6 rate 44100 gain -rh dither
312
+
313
+ Here, the second gain invocation, reclaims as much of the headroom as it can from the preceding
314
+ effects, but retains as much headroom as is needed for subsequent processing.
315
+
316
+ See also the norm and vol effects.
317
+ """
318
+
319
+
320
+ def highpass() -> str:
321
+ return """
322
+ highpass [-1|-2] frequency[k] [width[q|o|h|k]]
323
+ Apply a high-pass filter with 3 dB point frequency. The filter can be either single-pole (with -1),
324
+ or double-pole (the default, or with -2). width applies only to double-pole filters; the default
325
+ is Q = 0.707 and gives a Butterworth response. The filters roll off at 6 dB per pole per octave
326
+ (20 dB per pole per decade).
327
+
328
+ See also sinc for filters with a steeper roll-off.
329
+ """
330
+
331
+
332
+ def hilbert() -> str:
333
+ return """
334
+ hilbert [-n taps]
335
+ Apply an odd-tap Hilbert transform filter, phase-shifting the signal by 90 degrees.
336
+
337
+ This is used in many matrix coding schemes and for analytic signal generation. The process is
338
+ often written as a multiplication by i (or j), the imaginary unit.
339
+
340
+ An odd-tap Hilbert transform filter has a bandpass characteristic, attenuating the lowest and
341
+ highest frequencies. Its bandwidth can be controlled by the number of filter taps, which can be
342
+ specified with -n. By default, the number of taps is chosen for a cutoff frequency of about 75 Hz.
343
+ """
344
+
345
+
346
+ def loudness() -> str:
347
+ return """
348
+ loudness [gain [reference]]
349
+ Loudness control - similar to the gain effect, but provides equalisation for the human auditory
350
+ system. See https://en.wikipedia.org/wiki/Loudness for a detailed description of loudness. The
351
+ gain is adjusted by the given gain parameter (usually negative) and the signal equalised
352
+ according to ISO 226 w.r.t. a reference level of 65 dB, though an alternative reference level
353
+ may be given if the original audio has been equalised for some other optimal level. A default
354
+ gain of -10 dB is used if a gain value is not given.
355
+
356
+ See also the gain effect.
357
+ """
358
+
359
+
360
+ def lowpass() -> str:
361
+ return """
362
+ lowpass [-1|-2] frequency[k] [width[q|o|h|k]]
363
+ Apply a low-pass filter with 3 dB point frequency. The filter can be either single-pole (with -1),
364
+ or double-pole (the default, or with -2). width applies only to double-pole filters; the default
365
+ is Q = 0.707 and gives a Butterworth response. The filters roll off at 6 dB per pole per octave
366
+ (20 dB per pole per decade).
367
+
368
+ See also sinc for filters with a steeper roll-off.
369
+ """
370
+
371
+
372
+ def mcompand() -> str:
373
+ return """
374
+ mcompand "attack1,decay1{,attack2,decay2} [soft-knee-dB:]in-dB1[,out-dB1]{,in-dB2,out-dB2} [gain [initial-volume-dB [delay]]]" {crossover-freq[k] "attack1,..."}
375
+ The multi-band compander is similar to the single-band compander but the audio is first divided
376
+ into bands using Linkwitz-Riley cross-over filters and a separately specifiable compander run on
377
+ each band. See the compand effect for the definition of its parameters. Compand parameters are
378
+ specified between double quotes and the crossover frequency for that band is given by
379
+ crossover-freq; these can be repeated to create multiple bands.
380
+
381
+ For example, the following (one long) command shows how multi-band companding is typically used
382
+ in FM radio:
383
+
384
+ play track1.wav gain -3 sinc 8000- 29 100 mcompand \
385
+ "0.005,0.1 -47,-40,-34,-34,-17,-33" 100 \
386
+ "0.003,0.05 -47,-40,-34,-34,-17,-33" 400 \
387
+ "0.000625,0.0125 -47,-40,-34,-34,-15,-33" 1600 \
388
+ "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30" 6400 \
389
+ "0,0.025 -38,-31,-28,-28,-0,-25" \
390
+ gain 15 highpass 22 highpass 22 sinc -n 255 -b 16 -17500 \
391
+ gain 9 lowpass -1 17801
392
+
393
+ The audio file is played with a simulated FM radio sound (or broadcast signal condition if the
394
+ lowpass filter at the end is skipped). Note that the pipeline is set up with US-style 75 us
395
+ pre-emphasis.
396
+
397
+ See also compand for a single-band companding effect.
398
+ """
399
+
400
+
401
+ def norm() -> str:
402
+ return """
403
+ norm [dB-level]
404
+ Normalise the audio. norm is just an alias for gain -n; see the gain effect for details.
405
+ """
406
+
407
+
408
+ def overdrive() -> str:
409
+ return """
410
+ overdrive [gain(20) [colour(20)]]
411
+ Non linear distortion. The colour parameter controls the amount of even harmonic content in the
412
+ over-driven output.
413
+ """
414
+
415
+
416
+ def phaser() -> str:
417
+ return """
418
+ phaser gain-in gain-out delay decay speed [-s|-t]
419
+ Add a phasing effect to the audio.
420
+
421
+ delay/decay/speed gives the delay in milliseconds and the decay (relative to gain-in) with a
422
+ modulation speed in Hz. The modulation is either sinusoidal (-s) - preferable for multiple
423
+ instruments, or triangular (-t) - gives single instruments a sharper phasing effect. The decay
424
+ should be less than 0.5 to avoid feedback, and usually no less than 0.1. Gain-out is the
425
+ volume of the output.
426
+
427
+ For example:
428
+
429
+ play snare.flac phaser 0.8 0.74 3 0.4 0.5 -t
430
+
431
+ Gentler:
432
+
433
+ play snare.flac phaser 0.9 0.85 4 0.23 1.3 -s
434
+
435
+ A popular sound:
436
+
437
+ play snare.flac phaser 0.89 0.85 1 0.24 2 -t
438
+
439
+ More severe:
440
+
441
+ play snare.flac phaser 0.6 0.66 3 0.6 2 -t
442
+ """
443
+
444
+
445
+ def pitch() -> str:
446
+ return """
447
+ pitch [-q] shift [segment [search [overlap]]]
448
+ Change the audio pitch (but not tempo).
449
+
450
+ shift gives the pitch shift as positive or negative 'cents' (i.e. 100ths of a semitone). See
451
+ the tempo effect for a description of the other parameters.
452
+
453
+ See also the bend, speed, and tempo effects.
454
+ """
455
+
456
+
457
+ def reverb() -> str:
458
+ return """
459
+ reverb [-w|--wet-only] [reverberance (50%) [HF-damping (50%) [room-scale (100%) [stereo-depth (100%) [pre-delay (0 ms) [wet-gain (0 dB)]]]]]]
460
+ Add reverberation to the audio using the 'freeverb' algorithm. A reverberation effect is
461
+ sometimes desirable for concert halls that are too small or contain so many people that the
462
+ hall's natural reverberance is diminished. Applying a small amount of stereo reverb to a (dry)
463
+ mono signal will usually make it sound more natural. See reference [3] in the SoX manual (sox(1)) for a detailed description of
464
+ reverberation.
465
+
466
+ Note that this effect increases both the volume and the length of the audio, so to prevent
467
+ clipping in these domains, a typical invocation might be:
468
+
469
+ play dry.wav gain -3 pad 0 3 reverb
470
+
471
+ The -w option can be given to select only the 'wet' signal, thus allowing it to be processed
472
+ further, independently of the 'dry' signal. E.g.
473
+
474
+ play -m voice.wav "|sox voice.wav -p reverse reverb -w reverse"
475
+
476
+ for a reverse reverb effect.
477
+ """
478
+
479
+
480
+ def sinc() -> str:
481
+ return """
482
+ sinc [-a att|-b beta] [-p phase|-M|-I|-L] [-t tbw|-n taps] [freqHP] [-freqLP [-t tbw|-n taps]]
483
+ Apply a sinc kaiser-windowed low-pass, high-pass, band-pass, or band-reject filter to the
484
+ signal. The freqHP and freqLP parameters give the frequencies of the 6 dB points of a
485
+ high-pass and low-pass filter that may be invoked individually, or together. If both are
486
+ given, then freqHP less than freqLP creates a band-pass filter, freqHP greater than freqLP
487
+ creates a band-reject filter. For example, the invocations
488
+
489
+ sinc 3k
490
+ sinc -4k
491
+ sinc 3k-4k
492
+ sinc 4k-3k
493
+
494
+ create a high-pass, low-pass, band-pass, and band-reject filter respectively.
495
+
496
+ The default stop-band attenuation of 120 dB can be overridden with -a; alternatively, the
497
+ kaiser-window 'beta' parameter can be given directly with -b.
498
+
499
+ The default transition band-width of 5% of the total band can be overridden with -t (and
500
+ tbw in Hertz); alternatively, the number of filter taps can be given directly with -n.
501
+
502
+ If both freqHP and freqLP are given, then a -t or -n option given to the left of the
503
+ frequencies applies to both frequencies; one of these options given to the right of the
504
+ frequencies applies only to freqLP.
505
+
506
+ The -p, -M, -I, and -L options control the filter's phase response; see the rate effect
507
+ for details.
508
+ """
509
+
510
+
511
+ def speed() -> str:
512
+ return """
513
+ speed factor[c]
514
+ Adjust the audio speed (pitch and tempo together). factor is either the ratio of the new
515
+ speed to the old speed: greater than 1 speeds up, less than 1 slows down, or, if appended
516
+ with the letter 'c', the number of cents (i.e. 100ths of a semitone) by which the pitch
517
+ (and tempo) should be adjusted: greater than 0 increases, less than 0 decreases.
518
+
519
+ Technically, the speed effect only changes the sample rate information, leaving the samples
520
+ themselves untouched. The rate effect is invoked automatically to resample to the output
521
+ sample rate, using its default quality/speed. For higher quality or higher speed
522
+ resampling, in addition to the speed effect, specify the rate effect with the desired
523
+ quality option.
524
+
525
+ See also the bend, pitch, and tempo effects.
526
+ """
527
+
528
+
529
+ def tempo() -> str:
530
+ return """
531
+ tempo [-q] [-m|-s|-l] factor [segment [search [overlap]]]
532
+ Change the audio playback speed but not its pitch. This effect uses the WSOLA algorithm. The
533
+ audio is chopped up into segments which are then shifted in the time domain and overlapped
534
+ (cross-faded) at points where their waveforms are most similar as determined by measurement
535
+ of 'least squares'.
536
+
537
+ By default, linear searches are used to find the best overlapping points. If the optional -q
538
+ parameter is given, tree searches are used instead. This makes the effect work more quickly,
539
+ but the result may not sound as good. However, if you must improve the processing speed,
540
+ this generally reduces the sound quality less than reducing the search or overlap values.
541
+
542
+ The -m option is used to optimize default values of segment, search and overlap for music
543
+ processing.
544
+
545
+ The -s option is used to optimize default values of segment, search and overlap for speech
546
+ processing.
547
+
548
+ The -l option is used to optimize default values of segment, search and overlap for 'linear'
549
+ processing that tends to cause more noticeable distortion but may be useful when factor is
550
+ close to 1.
551
+
552
+ If -m, -s, or -l is specified, the default value of segment will be calculated based on
553
+ factor, while default search and overlap values are based on segment. Any values you provide
554
+ still override these default values.
555
+
556
+ factor gives the ratio of new tempo to the old tempo, so e.g. 1.1 speeds up the tempo by
557
+ 10%, and 0.9 slows it down by 10%.
558
+
559
+ The optional segment parameter selects the algorithm's segment size in milliseconds. If no
560
+ other flags are specified, the default value is 82 and is typically suited to making small
561
+ changes to the tempo of music. For larger changes (e.g. a factor of 2), 41 ms may give a
562
+ better result. The -m, -s, and -l flags will cause the segment default to be automatically
563
+ adjusted based on factor. For example using -s (for speech) with a tempo of 1.25 will
564
+ calculate a default segment value of 32.
565
+
566
+ The optional search parameter gives the audio length in milliseconds over which the algorithm
567
+ will search for overlapping points. If no other flags are specified, the default value is
568
+ 14.68. Larger values use more processing time and may or may not produce better results. A
569
+ practical maximum is half the value of segment. Search can be reduced to cut processing time
570
+ at the risk of degrading output quality. The -m, -s, and -l flags will cause the search
571
+ default to be automatically adjusted based on segment.
572
+
573
+ The optional overlap parameter gives the segment overlap length in milliseconds. Default
574
+ value is 12, but -m, -s, or -l flags automatically adjust overlap based on segment size.
575
+ Increasing overlap increases processing time and may increase quality. A practical maximum
576
+ for overlap is the value of search, with overlap typically being (at least) a little
577
+ smaller than search.
578
+
579
+ See also speed for an effect that changes tempo and pitch together, pitch and bend for
580
+ effects that change pitch only, and stretch for an effect that changes tempo using a
581
+ different algorithm.
582
+ """
583
+
584
+
585
+ def treble() -> str:
586
+ return """
587
+ treble gain [frequency[k] [width[s|h|k|o|q]]]
588
+ Boost or cut the treble (upper) frequencies of the audio using a two-pole shelving
589
+ filter with a response similar to that of a standard hi-fi's tone-controls. This is
590
+ also known as shelving equalisation (EQ).
591
+
592
+ gain gives the gain at whichever is the lower of ~22 kHz and the Nyquist frequency.
593
+ Its useful range is about -20 (for a large cut) to +20 (for a large boost). Beware
594
+ of Clipping when using a positive gain.
595
+
596
+ If desired, the filter can be fine-tuned using the following optional parameters:
597
+
598
+ frequency sets the filter's central frequency and so can be used to extend or reduce
599
+ the frequency range to be boosted or cut. The default value is 3 kHz.
600
+
601
+ width determines how steep is the filter's shelf transition. In addition to the
602
+ common width specification methods described above, 'slope' (the default, or if
603
+ appended with 's') may be used. The useful range of 'slope' is about 0.3, for a
604
+ gentle slope, to 1 (the maximum), for a steep slope; the default value is 0.5.
605
+
606
+ See also equalizer for a peaking equalisation effect.
607
+ """
608
+
609
+
610
+ def tremolo() -> str:
611
+ return """
612
+ tremolo speed [depth]
613
+ Apply a tremolo (low frequency amplitude modulation) effect to the audio. The
614
+ tremolo frequency in Hz is given by speed, and the depth as a percentage by
615
+ depth (default 40).
616
+ """
617
+
618
+
619
+ def vol() -> str:
620
+ return """
621
+ vol gain [type [limitergain]]
622
+ Apply an amplification or an attenuation to the audio signal. Unlike the -v option
623
+ (which is used for balancing multiple input files as they enter the sox effects
624
+ processing chain), vol is an effect like any other so can be applied anywhere, and
625
+ several times if necessary, during the processing chain.
626
+
627
+ The amount to change the volume is given by gain which is interpreted, according
628
+ to the given type, as follows: if type is amplitude (or is omitted), then gain is
629
+ an amplitude (i.e. voltage or linear) ratio, if power, then a power (i.e. wattage
630
+ or voltage-squared) ratio, and if dB, then a power change in dB.
631
+
632
+ When type is amplitude or power, a gain of 1 leaves the volume unchanged, less
633
+ than 1 decreases it, and greater than 1 increases it; a negative gain inverts the
634
+ audio signal in addition to adjusting its volume.
635
+
636
+ When type is dB, a gain of 0 leaves the volume unchanged, less than 0 decreases
637
+ it, and greater than 0 increases it.
638
+
639
+ Beware of Clipping when increasing the volume.
640
+
641
+ The gain and the type parameters can be concatenated if desired, e.g. vol 10 dB.
642
+
643
+ An optional limitergain value can be specified and should be a value much less
644
+ than 1 (e.g. 0.05 or 0.02) and is used only on peaks to prevent clipping. Not
645
+ specifying this parameter will cause no limiter to be used. In verbose mode,
646
+ this effect will display the percentage of the audio that needed to be limited.
647
+
648
+ See also gain for a volume-changing effect with different capabilities, and
649
+ compand for a dynamic-range compression/expansion/limiting effect.
650
+ """
@@ -1,5 +1,5 @@
1
- from sonusai.mixture.datatypes import AudioF
2
- from sonusai.mixture.datatypes import SpectralMask
1
+ from ..datatypes import AudioF
2
+ from ..datatypes import SpectralMask
3
3
 
4
4
 
5
5
  def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int | None = None) -> AudioF: