lt-tensor 0.0.1a35__py3-none-any.whl → 0.0.1a37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lt_tensor/__init__.py +1 -1
- lt_tensor/model_zoo/audio_models/bigvgan/__init__.py +10 -10
- lt_tensor/model_zoo/audio_models/hifigan/__init__.py +6 -10
- lt_tensor/model_zoo/losses/CQT/__init__.py +0 -0
- lt_tensor/model_zoo/losses/CQT/transforms.py +336 -0
- lt_tensor/model_zoo/losses/CQT/utils.py +519 -0
- lt_tensor/model_zoo/losses/discriminators.py +232 -0
- lt_tensor/processors/audio.py +275 -123
- {lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/METADATA +2 -2
- {lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/RECORD +13 -10
- {lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/WHEEL +0 -0
- {lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/licenses/LICENSE +0 -0
- {lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/top_level.txt +0 -0
lt_tensor/processors/audio.py
CHANGED
@@ -10,6 +10,7 @@ from lt_utils.type_utils import is_file, is_array
 from lt_utils.file_ops import FileScan, get_file_name, path_to_str
 from torchaudio.functional import detect_pitch_frequency
 import torch.nn.functional as F
+from librosa.filters import mel as _mel_filter_bank
 
 DEFAULT_DEVICE = torch.tensor([0]).device
 
@@ -25,7 +26,7 @@ class AudioProcessorConfig(ModelConfig):
     f_min: float = 0
     f_max: Optional[float] = None
     center: bool = True
-    mel_scale: Literal["htk" "slaney"] = "htk"
+    mel_scale: Literal["htk", "slaney"] = "htk"
     std: int = 4
     mean: int = -4
     n_iter: int = 32
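The `mel_scale` change is a real bug fix, not a style tweak: in the old annotation, `"htk" "slaney"` is Python's implicit string concatenation, so at runtime the `Literal` admitted only the single bogus value `"htkslaney"`. A quick sketch to verify this:

    from typing import Literal, get_args

    BadScale = Literal["htk" "slaney"]    # adjacent strings concatenate
    GoodScale = Literal["htk", "slaney"]  # what 0.0.1a37 now declares

    print(get_args(BadScale))   # ('htkslaney',)
    print(get_args(GoodScale))  # ('htk', 'slaney')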
@@ -33,6 +34,7 @@ class AudioProcessorConfig(ModelConfig):
     normalized: bool = False
     onesided: Optional[bool] = None
     n_stft: int = None
+    mel_default: Literal["torch", "librosa"] = "librosa"
 
     def __init__(
         self,
@@ -49,6 +51,7 @@ class AudioProcessorConfig(ModelConfig):
         mean: int = -4,
         normalized: bool = False,
         onesided: Optional[bool] = None,
+        mel_default: Literal["torch", "librosa"] = "librosa",
         *args,
         **kwargs,
     ):
@@ -66,6 +69,7 @@ class AudioProcessorConfig(ModelConfig):
             "mean": mean,
             "normalized": normalized,
             "onesided": onesided,
+            "mel_default": mel_default,
         }
         super().__init__(**settings)
         self.post_process()
@@ -73,7 +77,7 @@ class AudioProcessorConfig(ModelConfig):
     def post_process(self):
         self.n_stft = self.n_fft // 2 + 1
         # some functions needs this to be a non-zero or not None value.
-        self.f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1))
+        self.f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1)) * 2)
         self.default_f_max = min(
             default(self.f_max, self.sample_rate // 2), self.sample_rate // 2
         )
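The rebuilt `post_process` line now closes its parenthesis and doubles the per-bin frequency, flooring `f_min` at roughly two FFT bins. With illustrative values (not necessarily the package defaults):

    # Illustrative numbers only; lt-tensor's actual defaults may differ.
    sample_rate, n_fft = 24_000, 1024
    f_min_floor = (sample_rate / (n_fft - 1)) * 2
    print(f"{f_min_floor:.2f} Hz")  # 46.92 Hz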
@@ -88,14 +92,10 @@ def _comp_rms_helper(i: int, audio: Tensor, mel: Optional[Tensor]):
 
 
 class AudioProcessor(Model):
-    def __init__(
-        self,
-        config: AudioProcessorConfig = AudioProcessorConfig(),
-        window: Optional[Tensor] = None,
-    ):
+    def __init__(self, config: AudioProcessorConfig = AudioProcessorConfig()):
         super().__init__()
         self.cfg = config
-        self.
+        self._mel_spec_torch = torchaudio.transforms.MelSpectrogram(
             sample_rate=self.cfg.sample_rate,
             n_mels=self.cfg.n_mels,
             n_fft=self.cfg.n_fft,
@@ -105,9 +105,9 @@ class AudioProcessor(Model):
             f_min=self.cfg.f_min,
             f_max=self.cfg.f_max,
             mel_scale=self.cfg.mel_scale,
-            onesided=self.cfg.onesided,
             normalized=self.cfg.normalized,
         )
+
         self._mel_rscale = torchaudio.transforms.InverseMelScale(
             n_stft=self.cfg.n_stft,
             n_mels=self.cfg.n_mels,
@@ -116,36 +116,119 @@ class AudioProcessor(Model):
             f_max=self.cfg.f_max,
             mel_scale=self.cfg.mel_scale,
         )
-
+        self.mel_lib_padding = (self.cfg.n_fft - self.cfg.hop_length) // 2
         self.register_buffer(
             "window",
-
+            torch.hann_window(self.cfg.win_length),
         )
+        self.register_buffer(
+            "mel_filter_bank",
+            torch.from_numpy(
+                _mel_filter_bank(
+                    sr=self.cfg.sample_rate,
+                    n_fft=self.cfg.n_fft,
+                    n_mels=self.cfg.n_mels,
+                    fmin=self.cfg.f_min,
+                    fmax=self.cfg.f_max,
+                )
+            ).float(),
+        )
+
+    def spectral_norm(self, x: Tensor, c: int = 1, eps: float = 1e-5):
+        return torch.log(torch.clamp(x, min=eps) * c)
 
+    def spectral_de_norm(self, x: Tensor, c: int = 1):
+        return torch.exp(x) / c
+
+    def log_norm(
+        self,
+        entry: Tensor,
+        eps: float = 1e-5,
+        mean: Optional[Number] = None,
+        std: Optional[Number] = None,
+    ) -> Tensor:
+        mean = default(mean, self.cfg.mean)
+        std = default(std, self.cfg.std)
+        return (torch.log(eps + entry.unsqueeze(0)) - mean) / std
 
-
     def compute_mel(
         self,
         wave: Tensor,
-
-
-
-
+        method: Optional[Literal["torch", "librosa"]] = None,
+        apply_norm: bool = False,
+        eps: Optional[float] = None,
+        **kwargs,
+    ) -> Tensor:
+        method = default(method, self.cfg.mel_default)
+        if method == "torch":
+            return self.compute_mel_torch(
+                wave,
+                log_norm=apply_norm,
+                eps=eps,
+                mean=kwargs.get("mean", None),
+                std=kwargs.get("std", None),
+            )
+        return self.compute_mel_librosa(
+            wave,
+            log_norm=apply_norm,
+            eps=eps,
+        )
+
+    def compute_mel_torch(
+        self,
+        wave: Tensor,
+        log_norm: bool = False,
+        eps: Optional[float] = None,
+        mean: Optional[Number] = None,
+        std: Optional[Number] = None,
+        *args,
+        **kwargs,
     ) -> Tensor:
-        """Returns:
+        """Returns: (M, T) or (B, M, T) if batched"""
         try:
-            mel_tensor = self.
-            if not raw_mel_only:
-                mel_tensor = (
-                    torch.log(eps + mel_tensor.unsqueeze(0)) - self.cfg.mean
-                ) / self.cfg.std
-            return mel_tensor.squeeze()
+            mel_tensor = self._mel_spec_torch.forward(wave.to(self.device))  # [M, T]
 
         except RuntimeError as e:
-
-
-
-
+            mel_tensor = self._mel_spec_torch.forward(wave.to(self.device))  # [M, T]
+        if log_norm:
+            return self.log_norm(mel_tensor, eps, mean, std).squeeze()
+        return mel_tensor.squeeze()
+
+    def compute_mel_librosa(
+        self,
+        wave: Tensor,
+        eps: float = 1e-5,
+        spectral_norm: bool = False,
+        *args,
+        **kwargs,
+    ):
+        wave = torch.nn.functional.pad(
+            wave.unsqueeze(1),
+            (self.mel_lib_padding, self.mel_lib_padding),
+            mode="reflect",
+        ).squeeze(1)
+        spec = torch.stft(
+            wave,
+            self.cfg.n_fft,
+            hop_length=self.cfg.hop_length,
+            win_length=self.cfg.win_length,
+            window=self.window,
+            center=self.cfg.center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+        spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-12)
+        try:
+            results = torch.matmul(self.mel_filter_bank, spec)
+        except RuntimeError:
+            self.mel_filter_bank = self.mel_filter_bank.to(self.device)
+            self.window = self.window.to(self.device)
+            results = torch.matmul(self.mel_filter_bank, spec)
+        if spectral_norm:
+            return self.spectral_norm(results, eps=eps).squeeze()
+        return results.squeeze()
 
     def compute_inverse_mel(self, melspec: Tensor, *, _recall=False):
         try:
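The upshot of this hunk: mel computation is now backend-switchable between torchaudio's `MelSpectrogram` and a librosa-style filter bank applied to a plain `torch.stft`, with `"librosa"` as the new default. The librosa path reflect-pads the waveform by `(n_fft - hop_length) // 2` and matmuls the registered `mel_filter_bank` buffer against the STFT magnitude, the recipe commonly seen in HiFi-GAN-style vocoder code. A minimal usage sketch (class and method names come from this diff; tensor shapes are illustrative):

    import torch
    from lt_tensor.processors.audio import AudioProcessor, AudioProcessorConfig

    ap = AudioProcessor(AudioProcessorConfig())
    wave = torch.randn(1, 48_000)  # stand-in for one mono clip

    mel_default = ap.compute_mel(wave)                # follows cfg.mel_default ("librosa")
    mel_torch = ap.compute_mel(wave, method="torch")  # torchaudio MelSpectrogram path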
@@ -203,13 +286,16 @@ class AudioProcessor(Model):
         rms_ = []
         for i in range(B):
             _t = _comp_rms_helper(i, audio, mel)
-            _r = librosa.feature.rms(**_t, **rms_kwargs)[
-                0
-            ]
+            _r = librosa.feature.rms(**_t, **rms_kwargs)[0]
             rms_.append(_r)
         return self.from_numpy_batch(rms_, default_device, default_dtype).squeeze()
 
-    def pitch_shift(
+    def pitch_shift(
+        self,
+        audio: torch.Tensor,
+        sample_rate: Optional[int] = None,
+        n_steps: float = 2.0,
+    ):
         """
         Shifts the pitch of an audio tensor by `n_steps` semitones.
 
@@ -225,21 +311,25 @@ class AudioProcessor(Model):
         src_dtype = audio.dtype
         audio = audio.squeeze()
         sample_rate = default(sample_rate, self.cfg.sample_rate)
+
         def _shift_one(wav):
             wav_np = self.to_numpy_safe(wav)
-            shifted_np = librosa.effects.pitch_shift(
+            shifted_np = librosa.effects.pitch_shift(
+                wav_np, sr=sample_rate, n_steps=n_steps
+            )
             return torch.from_numpy(shifted_np)
 
         if audio.ndim == 1:
             return _shift_one(audio).to(device=src_device, dtype=src_dtype)
-        return torch.stack([_shift_one(a) for a in audio]).to(
-
+        return torch.stack([_shift_one(a) for a in audio]).to(
+            device=src_device, dtype=src_dtype
+        )
 
     @staticmethod
-    def calc_pitch_fmin(sr:int, frame_length:float):
+    def calc_pitch_fmin(sr: int, frame_length: float):
         """For pitch f_min"""
         return (sr / (frame_length - 1)) * 2
-
+
    def compute_pitch(
         self,
         audio: Tensor,
@@ -261,8 +351,10 @@ class AudioProcessor(Model):
             B = 1
         sr = default(sr, self.cfg.sample_rate)
         frame_length = default(frame_length, self.cfg.n_fft)
-        fmin = max(
-
+        fmin = max(
+            default(fmin, self.cfg.f_min), self.calc_pitch_fmin(sr, frame_length)
+        )
+        fmax = min(max(default(fmax, self.cfg.default_f_max), fmin + 1), sr // 2)
         hop_length = default(hop_length, self.cfg.hop_length)
         center = default(center, self.cfg.center)
         yn_kwargs = dict(
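`calc_pitch_fmin` applies the same two-bin floor to pitch detection, and `compute_pitch` now also clamps `fmax` into `(fmin, sr // 2]`. Again with illustrative numbers:

    # Illustrative numbers only.
    sr, frame_length = 22_050, 2048
    fmin_floor = (sr / (frame_length - 1)) * 2
    print(f"{fmin_floor:.2f} Hz")  # 21.54 Hz; fmax then lands in (fmin, sr // 2]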
@@ -361,7 +453,7 @@ class AudioProcessor(Model):
         The modes available for upsampling are: `nearest`, `linear` (3D-only),
         `bilinear`, `bicubic` (4D-only), `trilinear` (5D-only)
         """
-
+        tensor = tensor.squeeze()
         if tensor.ndim == 2:  # [1, T]
             tensor = tensor.unsqueeze(1)  # [1, 1, T]
         elif tensor.ndim == 1:
@@ -376,7 +468,7 @@ class AudioProcessor(Model):
             antialias=antialias,
         )
 
-    def
+    def istft_spec_phase(
         self,
         spec: Tensor,
         phase: Tensor,
@@ -384,94 +476,155 @@ class AudioProcessor(Model):
         hop_length: Optional[int] = None,
         win_length: Optional[int] = None,
         length: Optional[int] = None,
-        center:
+        center: bool = True,
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = False,
-        *,
-        _recall: bool = False,
     ):
-
-
-
-
+        """Util for models that needs to reconstruct the audio using inverse stft"""
+        window = (
+            torch.hann_window(win_length, device=spec.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(spec.device)
+        )
+        return torch.istft(
+            spec * torch.exp(phase * 1j),
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            length=length,
+            return_complex=return_complex,
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if not
-
-
-
-
-
+    def istft(
+        self,
+        wave: Tensor,
+        n_fft: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        length: Optional[int] = None,
+        center: bool = True,
+        normalized: Optional[bool] = None,
+        onesided: Optional[bool] = None,
+        return_complex: bool = False,
+    ):
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
+        if not torch.is_complex(wave):
+            wave = wave * 1j
+        return torch.istft(
+            wave,
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            length=length,
+            return_complex=return_complex,
+        )
+
+    def stft(
+        self,
+        wave: Tensor,
+        center: bool = True,
+        n_fft: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        normalized: Optional[bool] = None,
+        onesided: Optional[bool] = None,
+        return_complex: bool = True,
+    ):
+
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
+
+        results = torch.stft(
+            input=wave,
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            pad_mode="reflect",
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            return_complex=True,  # always, then if we need a not complex type we use view as real.
+        )
+        if not return_complex:
+            return torch.view_as_real(results)
+        return results
 
     def istft_norm(
         self,
         wave: Tensor,
         length: Optional[int] = None,
-
-
+        center: bool = True,
+        n_fft: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        normalized: Optional[bool] = None,
+        onesided: Optional[bool] = None,
+        return_complex: bool = False,
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raise e
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
+        spectrogram = torch.stft(
+            input=wave,
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            pad_mode="reflect",
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            return_complex=True,
+        )
+        return torch.istft(
+            spectrogram
+            * torch.full(
+                spectrogram.size(),
+                fill_value=1,
+                device=spectrogram.device,
+            ),
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=self.window,
+            length=length,
+            center=center,
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            return_complex=return_complex,
+        )
 
     def load_audio(
         self,
         path: PathLike,
-        top_db: float =
+        top_db: Optional[float] = None,
         normalize: bool = False,
+        mono: bool = True,
         *,
-
-        frame_length: int = 2048,
+        sample_rate: Optional[float] = None,
         hop_length: int = 512,
-
-        offset: float = 0.0,
+        frame_length: int = 2048,
         duration: Optional[float] = None,
+        offset: float = 0.0,
         dtype: Any = np.float32,
         res_type: str = "soxr_hq",
         fix: bool = True,
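Before the diff continues into `load_audio`'s remaining parameters, a note on the STFT helpers added above: the core of `istft_spec_phase` is the complex reconstruction `spec * exp(1j * phase)` fed to `torch.istft`, while `stft` and `istft` are thin config-aware wrappers around the torch primitives. A standalone sketch of the same round trip, with hypothetical sizes:

    import torch

    n_fft, hop = 1024, 256
    window = torch.hann_window(n_fft)
    wave = torch.randn(2, hop * 100)  # two fake mono clips

    cspec = torch.stft(wave, n_fft, hop_length=hop, window=window, return_complex=True)
    spec, phase = cspec.abs(), cspec.angle()

    # What istft_spec_phase does, with the processor's defaults written out:
    recon = torch.istft(spec * torch.exp(1j * phase), n_fft, hop_length=hop, window=window)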
@@ -481,29 +634,32 @@ class AudioProcessor(Model):
         norm_axis: int = 0,
         norm_threshold: Optional[float] = None,
         norm_fill: Optional[bool] = None,
+        ref: float | Callable[[np.ndarray], Any] = np.max,
     ) -> Tensor:
         is_file(path, True)
+        sample_rate = default(sample_rate, self.cfg.sample_rate)
         wave, sr = librosa.load(
             str(path),
-            sr=
+            sr=sample_rate,
             mono=mono,
             offset=offset,
             duration=duration,
             dtype=dtype,
             res_type=res_type,
         )
-
-            wave,
-
-
-
-
-
-
+        if top_db is not None:
+            wave, _ = librosa.effects.trim(
+                wave,
+                top_db=top_db,
+                ref=ref,
+                frame_length=frame_length,
+                hop_length=hop_length,
+            )
+        if sr != sample_rate:
             wave = librosa.resample(
                 wave,
                 orig_sr=sr,
-                target_sr=
+                target_sr=sample_rate,
                 res_type=res_type,
                 fix=fix,
                 scale=scale,
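`load_audio` grows explicit `mono`, `sample_rate`, and `ref` parameters, makes silence trimming opt-in (`top_db=None` skips `librosa.effects.trim` entirely), and only resamples when the decoded rate differs from the target. A usage sketch with a placeholder path:

    from lt_tensor.processors.audio import AudioProcessor

    ap = AudioProcessor()
    wave = ap.load_audio("clip.wav")                  # decoded at cfg.sample_rate, no trim
    trimmed = ap.load_audio("clip.wav", top_db=30.0)  # also strip leading/trailing silence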
@@ -543,10 +699,6 @@ class AudioProcessor(Model):
             maximum,
         )
 
-    def stft_loss(self, signal: Tensor, ground: Tensor, magnitude: float = 1.0):
-        ground = F.interpolate(ground, signal.shape[-1]).to(signal.device)
-        return F.l1_loss(signal.squeeze(), ground.squeeze()) * magnitude
-
     def forward(
         self,
         *inputs: Union[Tensor, float],
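`stft_loss` disappears from `AudioProcessor` in this release (the heavily expanded `losses/discriminators.py` is a plausible new home, though its diff is not shown here). Callers who relied on it can keep a local stand-in with the removed body:

    import torch.nn.functional as F
    from torch import Tensor

    def stft_loss(signal: Tensor, ground: Tensor, magnitude: float = 1.0) -> Tensor:
        # Verbatim logic of the removed method: resize target, then scaled L1.
        ground = F.interpolate(ground, signal.shape[-1]).to(signal.device)
        return F.l1_loss(signal.squeeze(), ground.squeeze()) * magnitude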
{lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lt-tensor
-Version: 0.0.
+Version: 0.0.1a37
 Summary: General utilities for PyTorch and others. Built for general use.
 Home-page: https://github.com/gr1336/lt-tensor/
 Author: gr1336
@@ -18,7 +18,7 @@ Requires-Dist: tokenizers
 Requires-Dist: pyyaml>=6.0.0
 Requires-Dist: numba>0.60.0
 Requires-Dist: lt-utils>=0.0.4
-Requires-Dist: librosa
+Requires-Dist: librosa<1,>=0.10.2.post1
 Requires-Dist: einops
 Requires-Dist: plotly
 Requires-Dist: scipy
{lt_tensor-0.0.1a35.dist-info → lt_tensor-0.0.1a37.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-lt_tensor/__init__.py,sha256=
+lt_tensor/__init__.py,sha256=CFVK5h2Y-p3xFJ6mCW8dI1FOFeObsOyDjyUqJtxmkmg,441
 lt_tensor/config_templates.py,sha256=F9UvL8paAjkSvio890kp8WznpYeI50pYnm9iqQroBxk,2797
 lt_tensor/losses.py,sha256=Heco_WyoC1HkNkcJEircOAzS9umusATHiNAG-FKGyzc,8918
 lt_tensor/lr_schedulers.py,sha256=6_vcfaPHrozfH3wvmNEdKSFYl6iTIijYoHL8vuG-45U,7651
@@ -25,16 +25,19 @@ lt_tensor/model_zoo/activations/alias_free/resample.py,sha256=3iM4fNr9fLNXXMyXvz
 lt_tensor/model_zoo/activations/snake/__init__.py,sha256=AtOAbJuMinxmKkppITGMzRbcbPQaALnl9mCtl1c3x0Q,4356
 lt_tensor/model_zoo/audio_models/__init__.py,sha256=WwiP9MekJreMOfKPWLl24VkRJIpLk6hhL8ch0aKgOss,103
 lt_tensor/model_zoo/audio_models/resblocks.py,sha256=u-foHxaFDUICjxSkpyHXljQYQG9zMxVYaOGqLR_nJ-k,7978
-lt_tensor/model_zoo/audio_models/bigvgan/__init__.py,sha256=
+lt_tensor/model_zoo/audio_models/bigvgan/__init__.py,sha256=4EZG8Non75dHoDCizMHbMTvPrKwdUlPYGHc7hkfT_nw,8526
 lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=PDuDYN1omD1RoAXcmxH3tEgfAuM3ZHAWzimD6ElMqEQ,9073
-lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=
+lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=ITSXHg3c0Um1P2HaPaXkQKI7meG5Ne60wTbyyYju3hY,6360
 lt_tensor/model_zoo/audio_models/istft/__init__.py,sha256=blICjLX_z_IFmR3_TCz_dJiSayLYGza9eG6fd9aKyvE,7448
 lt_tensor/model_zoo/losses/__init__.py,sha256=B9RAUxBiOZwooztnij1oLeRwZ7_MjnN3mPoum7saD6s,59
-lt_tensor/model_zoo/losses/discriminators.py,sha256=
+lt_tensor/model_zoo/losses/discriminators.py,sha256=o4cicNdOv0jH3ink7jTNeDqOnwmkmRtEj9E7IUIGnEI,31866
+lt_tensor/model_zoo/losses/CQT/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lt_tensor/model_zoo/losses/CQT/transforms.py,sha256=Vkid0J9dqLnlINfyyUlQf-qB3gOQAgU7W9j7xLOjDFw,13218
+lt_tensor/model_zoo/losses/CQT/utils.py,sha256=twGw6FVD7V5Ksfx_1BUEN3EP1tAS6wo-9LL3VnuHB8c,16751
 lt_tensor/processors/__init__.py,sha256=Pvxhh0KR65zLCgUd53_k5Z0y5JWWcO0ZBXFK9rv0o5w,109
-lt_tensor/processors/audio.py,sha256=
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
+lt_tensor/processors/audio.py,sha256=QadO6e7uXRkheNU8ba-SNw72HPD1XvR-6VJltoF8YRA,23535
+lt_tensor-0.0.1a37.dist-info/licenses/LICENSE,sha256=TbiyJWLgNqqgqhfCnrGwFIxy7EqGNrIZZcKhHrefcuU,11354
+lt_tensor-0.0.1a37.dist-info/METADATA,sha256=6EkGRk9fT_wsvl_pqKZ0S8I-x1Awkm9pvr3MKnW8OPM,1071
+lt_tensor-0.0.1a37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lt_tensor-0.0.1a37.dist-info/top_level.txt,sha256=35FuhFeXnUyvHWdbVHGPh0hS8euofafnJ_GJAVSF4Kk,10
+lt_tensor-0.0.1a37.dist-info/RECORD,,
File without changes
File without changes
File without changes