lt-tensor 0.0.1a16__py3-none-any.whl → 0.0.1a17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lt_tensor/config_templates.py +23 -15
- lt_tensor/model_base.py +36 -87
- lt_tensor/model_zoo/audio_models/__init__.py +1 -0
- lt_tensor/model_zoo/audio_models/diffwave/__init__.py +217 -1
- {lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/METADATA +1 -1
- {lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/RECORD +9 -10
- lt_tensor/model_zoo/audio_models/diffwave/model.py +0 -201
- {lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/WHEEL +0 -0
- {lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/licenses/LICENSE +0 -0
- {lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/top_level.txt +0 -0
lt_tensor/config_templates.py
CHANGED
```diff
@@ -7,16 +7,14 @@ from lt_tensor.misc_utils import updateDict
 
 class ModelConfig(ABC, OrderedDict):
     _default_settings: Dict[str, Any] = {}
-    _forbidden_list: List[str] = [
-        "_settings",
-    ]
+    _forbidden_list: List[str] = ["_default_settings", "_forbidden_list" "path_name"]
 
     def __init__(
         self,
-        settings: Dict[str, Any] =
+        settings: Dict[str, Any] = {},
         path_name: Optional[Union[str, PathLike]] = None,
     ):
-        assert is_dict(settings)
+        assert is_dict(settings, False)
         self._default_settings = settings
         if path_name is not None and is_pathlike(path_name):
            if not str(path_name).endswith(".json"):
```
```diff
@@ -37,31 +35,41 @@
             self.path_name += ".json"
 
     def reset_settings(self):
+        dk_keys = self.__dict__.keys()
         for s_name, setting in self._default_settings.items():
-            if s_name in self._forbidden_list:
+            if s_name in self._forbidden_list or s_name not in dk_keys:
                 continue
             updateDict(self, {s_name: setting})
 
     def save_config(
         self,
-        path_name: Union[PathLike, str],
+        path_name: Optional[Union[PathLike, str]] = None,
     ):
-
-
-
-
-
-
+        if not is_pathlike(path_name, True):
+            assert (
+                path_name is None
+            ), f"path_name should be a non-empty string or pathlike object! received instead: {path_name}."
+            path_name = self.path_name
+        else:
+            self._setup_path_name(path_name)
 
-
-
+        base = self.get_state_dict()
+        save_json(self.path_name, base, indent=2)
 
     def set_value(self, var_name: str, value: str) -> None:
+        assert var_name in self.__dict__, "Value not registered!"
+        assert var_name not in self._forbidden_list, "Not allowed!"
         updateDict(self, {var_name: value})
 
     def get_value(self, var_name: str) -> Any:
         return self.__dict__.get(var_name)
 
+    def __getattribute__(self, name):
+        return self.__dict__.get(name)
+
+    def get_state_dict(self):
+        return {k: y for k, y in self.__dict__.items() if k not in self._forbidden_list}
+
     @classmethod
     def from_dict(
         cls, dictionary: Dict[str, Any], path: Optional[Union[str, PathLike]] = None
```
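The config_templates.py changes route serialization through the new `get_state_dict` and make `save_config`'s path argument optional. A standalone sketch of the same pattern (illustrative names only, not lt-tensor's API):

```python
# Dict-backed config whose serializable state excludes bookkeeping keys,
# mirroring ModelConfig.get_state_dict / save_config above (hypothetical names).
import json
from typing import Any, Dict


class ConfigSketch:
    _forbidden = {"_defaults"}  # keys never written out, like _forbidden_list

    def __init__(self, settings: Dict[str, Any]):
        self._defaults = dict(settings)   # kept for resets, like _default_settings
        self.__dict__.update(settings)    # settings become plain attributes

    def get_state_dict(self) -> Dict[str, Any]:
        return {k: v for k, v in self.__dict__.items() if k not in self._forbidden}

    def save(self, path: str) -> None:
        with open(path, "w") as f:
            json.dump(self.get_state_dict(), f, indent=2)  # like save_json(..., indent=2)


cfg = ConfigSketch({"hidden_dim": 512, "lr": 2e-4})
print(cfg.get_state_dict())  # {'hidden_dim': 512, 'lr': 0.0002}
```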
lt_tensor/model_base.py
CHANGED
```diff
@@ -70,16 +70,6 @@ class LossTracker:
 
 class _Devices_Base(nn.Module):
     _device: torch.device = ROOT_DEVICE
-    _autocast: bool = False
-    _loss_history: LossTracker = LossTracker(100_000)
-
-    @property
-    def autocast(self):
-        return self._autocast
-
-    @autocast.setter
-    def autocast(self, value: bool):
-        self._autocast = value
 
     @property
     def device(self):
```
````diff
@@ -90,6 +80,30 @@
         assert isinstance(device, (str, torch.device))
         self._device = torch.device(device) if isinstance(device, str) else device
 
+    def _apply_device(self):
+        """Add here components that are needed to have device applied to them,
+        that usually the '.to()' function fails to apply
+
+        example:
+        ```
+        def _apply_device_to(self):
+            self.my_tensor = self.my_tensor.to(device=self.device)
+        ```
+        """
+        pass
+
+    def _to_dvc(
+        self, device_name: str, device_id: Optional[Union[int, torch.device]] = None
+    ):
+        device = device_name
+        if device_id is not None:
+            if isinstance(device_id, Number):
+                device += ":" + str(int(device_id))
+            elif hasattr(device_id, "index"):
+                device += ":" + str(device_id.index)
+        self.device = device
+        self._apply_device()
+
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(
             *args, **kwargs
````
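The `_to_dvc` helper added above centralizes device-string assembly before handing off to `_apply_device`. A standalone sketch of the string logic it applies (illustrative, detached from the class):

```python
# Join a device family with an optional index, as _to_dvc does above.
from numbers import Number
from typing import Optional, Union

import torch


def build_device(name: str, device_id: Optional[Union[int, torch.device]] = None) -> torch.device:
    device = name
    if device_id is not None:
        if isinstance(device_id, Number):
            device += ":" + str(int(device_id))       # e.g. 1 -> "cuda:1"
        elif hasattr(device_id, "index"):
            device += ":" + str(device_id.index)      # e.g. torch.device("cuda:1")
    return torch.device(device)


print(build_device("cuda", 1))                        # cuda:1
print(build_device("cuda", torch.device("cuda", 0)))  # cuda:0
print(build_device("cpu"))                            # cpu
```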
```diff
@@ -133,20 +147,9 @@
             raise
 
         self._apply(convert)
-        self.device
+        self._to_dvc(device)
         return self
 
-    def _to_dvc(
-        self, device_name: str, device_id: Optional[Union[int, torch.device]] = None
-    ):
-        device = device_name
-        if device_id is not None:
-            if isinstance(device_id, Number):
-                device += ":" + str(int(device_id))
-            elif hasattr(device_id, "index"):
-                device += ":" + str(device_id.index)
-        self.device = device
-
     def ipu(self, device: Optional[Union[int, torch.device]] = None) -> T:
         super().ipu(device)
         self._to_dvc("ipu", device)
```
```diff
@@ -178,11 +181,12 @@ class Model(_Devices_Base, ABC):
     This makes it easier to assign a device and retrieves it later
     """
 
+    _autocast: bool = False
     _is_unfrozen: bool = False
     # list with modules that can be frozen or unfrozen
     registered_freezable_modules: List[str] = []
     is_frozen: bool = False
-
+    _can_be_frozen: bool = (
         False  # to control if the module can or cannot be freezed by other modules from 'Model' class
     )
     # this is to be used on the case of they module requires low-rank adapters
```
````diff
@@ -193,18 +197,15 @@
 
     # dont save list:
     _dont_save_items: List[str] = []
+    _loss_history: LossTracker = LossTracker(20_000)
 
-
-
-
+    @property
+    def autocast(self):
+        return self._autocast
 
-
-
-
-        self.my_tensor = self.my_tensor.to(device=self.device)
-        ```
-        """
-        pass
+    @autocast.setter
+    def autocast(self, value: bool):
+        self._autocast = value
 
     def freeze_all(self, exclude: Optional[List[str]] = None):
         no_exclusions = not exclude
````
```diff
@@ -251,7 +252,7 @@
     def change_frozen_state(self, freeze: bool, module: nn.Module):
         try:
             if isinstance(module, Model):
-                if module.
+                if module._can_be_frozen:
                     if freeze:
                         return module.freeze_all()
                     return module.unfreeze_all()
```
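`change_frozen_state` now consults the `_can_be_frozen` flag before delegating to `freeze_all`/`unfreeze_all`. The diff does not show those methods' bodies; the sketch below assumes the conventional PyTorch meaning of freezing (toggling `requires_grad`):

```python
# Assumed freezing convention gated by a _can_be_frozen-style flag.
import torch.nn as nn


def set_frozen(module: nn.Module, freeze: bool) -> None:
    for p in module.parameters():
        p.requires_grad = not freeze


net = nn.Linear(4, 4)
set_frozen(net, True)
print(all(not p.requires_grad for p in net.parameters()))  # True
```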
```diff
@@ -496,10 +497,7 @@
         return self(*inputs, **kwargs)
 
     def __call__(self, *args, **kwds) -> POSSIBLE_OUTPUT_TYPES:
-
-        with torch.autocast(device_type=self.device.type):
-            return super().__call__(*args, **kwds)
-        else:
+        with torch.autocast(device_type=self.device.type, enabled=self.autocast):
             return super().__call__(*args, **kwds)
 
     @abstractmethod
```
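The rewritten `__call__` leans on `torch.autocast`'s `enabled` flag, which turns the context manager into a pass-through when autocast is off, so one code path covers both cases. A quick standalone check of that behavior:

```python
import torch

x = torch.randn(4, 4)

with torch.autocast(device_type="cpu", enabled=False):
    y = x @ x  # context is a no-op; runs in the tensor's own dtype
print(y.dtype)  # torch.float32

with torch.autocast(device_type="cpu", enabled=True, dtype=torch.bfloat16):
    y = x @ x  # eligible ops are autocast to bfloat16
print(y.dtype)  # torch.bfloat16
```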
```diff
@@ -541,52 +539,3 @@
         if quantity > 0:
             t_list = t_list[-quantity:]
         return sum(t_list) / len(t_list)
-
-    def freeze_unfreeze_loss(
-        self,
-        losses: Optional[Union[float, List[float]]] = None,
-        trigger_loss: Union[float, bool] = 0.1,
-        excluded_modules: Optional[List[str]] = None,
-        max_items: int = 1000,
-        loss_name: str = "train",
-    ):
-        """If a certain threshold is reached the weights will freeze or unfreeze the modules.
-        the biggest use-case for this function is when training GANs where the balance
-        from the discriminator and generator must be kept.
-
-        Args:
-            losses (Union[float, List[float]], Optional): The loss value or a list of losses that will be used to determine if it has reached or not the threshold. Defaults to None.
-            trigger_loss (float, bool, optional): The value where the weights will be either freeze or unfreeze. If set to a boolean it will freeze or unfreeze immediately according to the value (True = Freeze, False = Unfreeze). Defaults to 0.1.
-            excluded_modules (list[str], optional): The list of modules (names) that is not to be changed by either freezing nor unfreezing. Defaults to None.
-            max_items (float, optional): The number of previous losses to be locked behind to calculate the current average. Default to 1000.
-            loss_name (str, optional): Responsible to define with key to recover the loss.
-        returns:
-            bool: True when its frozen and false when its trainable.
-        """
-        if losses is not None:
-            self.add_loss(losses, "train")
-
-        if isinstance(trigger_loss, bool):
-            if trigger_loss:
-                if self._is_unfrozen:
-                    self.freeze_all(excluded_modules)
-                    self._is_unfrozen = False
-                return True
-            # else
-            if not self._is_unfrozen:
-                self.unfreeze_all(excluded_modules)
-                self._is_unfrozen = True
-            return False
-
-        value = self.get_loss_avg(loss_name, max_items)
-
-        if value <= trigger_loss:
-            if self._is_unfrozen:
-                self.freeze_all(excluded_modules)
-                self._is_unfrozen = False
-            return True
-        else:
-            if not self._is_unfrozen:
-                self.unfreeze_all(excluded_modules)
-                self._is_unfrozen = True
-            return False
```
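For reference, the removed `freeze_unfreeze_loss` boiled down to a threshold test over a window of recent losses: freeze while the average sits at or below `trigger_loss`, unfreeze otherwise. A condensed standalone sketch of that rule (detached from `Model`):

```python
# Threshold rule implemented by the removed helper, reduced to one function.
def should_freeze(loss_history: list, trigger_loss: float = 0.1,
                  max_items: int = 1000) -> bool:
    window = loss_history[-max_items:]
    return sum(window) / len(window) <= trigger_loss


print(should_freeze([0.5, 0.2, 0.05, 0.04]))  # False: average is 0.1975
print(should_freeze([0.05, 0.04, 0.03]))      # True: average is 0.04
```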
lt_tensor/model_zoo/audio_models/diffwave/__init__.py
CHANGED
```diff
@@ -1,3 +1,219 @@
 __all__ = ["DiffWave", "SpectrogramUpsampler", "DiffusionEmbedding"]
 
-
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from lt_tensor.config_templates import ModelConfig
+from lt_tensor.torch_commons import *
+from lt_tensor.model_base import Model
+from math import sqrt
+from lt_utils.common import *
+
+F.t
+
+
+class DiffWaveConfig(ModelConfig):
+    # Training params
+    batch_size = 16
+    learning_rate = 2e-4
+    max_grad_norm = None
+    # Data params
+    sample_rate = 24000
+    n_mels = 80
+    n_fft = 1024
+    hop_samples = 256
+    # Model params
+    residual_layers = 30
+    residual_channels = 64
+    dilation_cycle_length = 10
+    unconditional = False
+    noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist()
+    # settings for auto-fixes
+    interpolate = False
+    interpolation_mode: Literal[
+        "nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"
+    ] = "nearest"
+
+    def __init__(
+        self,
+        settings: Dict[str, Any] = {},
+        path_name: Optional[Union[str, PathLike]] = None,
+    ):
+        self._forbidden_list.extend()
+        super().__init__(settings, path_name)
+
+
+def Conv1d(*args, **kwargs):
+    layer = nn.Conv1d(*args, **kwargs)
+    nn.init.kaiming_normal_(layer.weight)
+    return layer
+
+
+class DiffusionEmbedding(Model):
+    def __init__(self, max_steps: int):
+        super().__init__()
+        self.register_buffer(
+            "embedding", self._build_embedding(max_steps), persistent=False
+        )
+        self.projection1 = nn.Linear(128, 512)
+        self.projection2 = nn.Linear(512, 512)
+        self.activation = nn.SiLU()
+
+    def forward(self, diffusion_step):
+        if diffusion_step.dtype in [torch.int32, torch.int64]:
+            x = self.embedding[diffusion_step]
+        else:
+            x = self._lerp_embedding(diffusion_step)
+        x = self.projection1(x)
+        x = self.activation(x)
+        x = self.projection2(x)
+        x = self.activation(x)
+        return x
+
+    def _lerp_embedding(self, t):
+        low_idx = torch.floor(t).long()
+        high_idx = torch.ceil(t).long()
+        low = self.embedding[low_idx]
+        high = self.embedding[high_idx]
+        return low + (high - low) * (t - low_idx)
+
+    def _build_embedding(self, max_steps):
+        steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
+        dims = torch.arange(64).unsqueeze(0)  # [1,64]
+        table = steps * 10.0 ** (dims * 4.0 / 63.0)  # [T,64]
+        table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
+        return table
+
+
+class SpectrogramUpsampler(Model):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
+        self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
+        self.activation = nn.LeakyReLU(0.4)
+
+    def forward(self, x):
+        x = torch.unsqueeze(x, 1)
+        x = self.activation(self.conv1(x))
+        x = self.activation(self.conv2(x))
+        x = torch.squeeze(x, 1)
+        return x
+
+
+class ResidualBlock(Model):
+    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
+        """
+        :param n_mels: inplanes of conv1x1 for spectrogram conditional
+        :param residual_channels: audio conv
+        :param dilation: audio conv dilation
+        :param uncond: disable spectrogram conditional
+        """
+        super().__init__()
+        self.dilated_conv = Conv1d(
+            residual_channels,
+            2 * residual_channels,
+            3,
+            padding=dilation,
+            dilation=dilation,
+        )
+        self.diffusion_projection = nn.Linear(512, residual_channels)
+        if not uncond:  # conditional model
+            self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
+        else:  # unconditional model
+            self.conditioner_projection = None
+
+        self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
+
+    def forward(
+        self,
+        x: Tensor,
+        diffusion_step: Tensor,
+        conditioner: Optional[Tensor] = None,
+    ):
+
+        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
+        y = x + diffusion_step
+        if (
+            conditioner is None or self.conditioner_projection is None
+        ):  # using a unconditional model
+            y = self.dilated_conv(y)
+        else:
+            conditioner = self.conditioner_projection(conditioner)
+            y = self.dilated_conv(y) + conditioner
+
+        gate, filter = torch.chunk(y, 2, dim=1)
+        y = torch.sigmoid(gate) * torch.tanh(filter)
+
+        y = self.output_projection(y)
+        residual, skip = torch.chunk(y, 2, dim=1)
+        return (x + residual) / sqrt(2.0), skip
+
+
+class DiffWave(Model):
+    def __init__(self, params: DiffWaveConfig = DiffWaveConfig()):
+        super().__init__()
+        self.params = params
+        self.n_hop = self.params.hop_samples
+        self.interpolate = self.params.interpolate
+        self.interpolate_mode = self.params.interpolation_mode
+        self.input_projection = Conv1d(1, params.residual_channels, 1)
+        self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
+        if self.params.unconditional:  # use unconditional model
+            self.spectrogram_upsample = None
+        else:
+            self.spectrogram_upsample = SpectrogramUpsampler()
+
+        self.residual_layers = nn.ModuleList(
+            [
+                ResidualBlock(
+                    params.n_mels,
+                    params.residual_channels,
+                    2 ** (i % params.dilation_cycle_length),
+                    uncond=params.unconditional,
+                )
+                for i in range(params.residual_layers)
+            ]
+        )
+        self.skip_projection = Conv1d(
+            params.residual_channels, params.residual_channels, 1
+        )
+        self.output_projection = Conv1d(params.residual_channels, 1, 1)
+        self.activation = nn.LeakyReLU(0.1)
+        self.r_sqrt = sqrt(len(self.residual_layers))
+        nn.init.zeros_(self.output_projection.weight)
+
+    def forward(
+        self,
+        audio: Tensor,
+        diffusion_step: Tensor,
+        spectrogram: Optional[Tensor] = None,
+    ):
+        T = x.shape[-1]
+        if x.ndim == 2:
+            x = audio.unsqueeze(1)
+        x = self.activation(self.input_projection(x))
+
+        diffusion_step = self.diffusion_embedding(diffusion_step)
+        if spectrogram is not None and self.spectrogram_upsample is not None:
+            if self.auto_interpolate:
+                # a little heavy, but helps a lot to fix mismatched shapes,
+                # not always recommended due to data loss
+                spectrogram = F.interpolate(
+                    input=spectrogram,
+                    size=int(T * self.n_hop),
+                    mode=self.interpolate_mode,
+                )
+            spectrogram = self.spectrogram_upsample(spectrogram)
+
+        skip = None
+        for i, layer in enumerate(self.residual_layers):
+            x, skip_connection = layer(x, diffusion_step, spectrogram)
+            if i == 0:
+                skip = skip_connection
+            else:
+                skip = skip_connection + skip
+        x = skip / self.r_sqrt
+        x = self.activation(self.skip_projection(x))
+        x = self.output_projection(x)
+        return x
```
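A standalone check of the sinusoidal table that `DiffusionEmbedding._build_embedding` constructs above: 64 geometrically spaced frequencies, with sin and cos halves concatenated into a 128-dim embedding per step, which is what feeds `nn.Linear(128, 512)`:

```python
import torch

max_steps = 50  # matches len(noise_schedule) in DiffWaveConfig
steps = torch.arange(max_steps).unsqueeze(1)  # [T, 1]
dims = torch.arange(64).unsqueeze(0)          # [1, 64]
table = steps * 10.0 ** (dims * 4.0 / 63.0)   # [T, 64]
table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
print(table.shape)  # torch.Size([50, 128])
```

Note also that `SpectrogramUpsampler`'s two `ConvTranspose2d` stages each upsample the time axis by 16, i.e. 256x overall, which matches `hop_samples = 256` in the config.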
{lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/RECORD
CHANGED
```diff
@@ -1,10 +1,10 @@
 lt_tensor/__init__.py,sha256=XxNCGcVL-haJyMpifr-GRaamo32R6jmqe3iOuS4ecfs,469
-lt_tensor/config_templates.py,sha256=
+lt_tensor/config_templates.py,sha256=xWZhktYVlkwvJVreqyACpWo-lJ5htG9vTZyqZ6OexzA,3899
 lt_tensor/losses.py,sha256=zvkCOnE5XpF3v6ymivRIdqPTsMM5zc94ZMom7YDi3zM,4946
 lt_tensor/lr_schedulers.py,sha256=LSZzqrOOLzSthD8k-W4cYPJt0vCjmHkiJkLr5e3yRTE,3659
 lt_tensor/math_ops.py,sha256=TkD4WQG42KsQ9Fg7FXOjf8f-ixtW0apf2XjaooecVx4,2257
 lt_tensor/misc_utils.py,sha256=S57M5XuGsIuaOKnEGZJsY3B2dTmggpdhsqQr51CQsYo,28754
-lt_tensor/model_base.py,sha256=
+lt_tensor/model_base.py,sha256=J-f-iQ9qGyYD4NkLljyAEkwtHKKbUKIrIpunMiSmh90,19155
 lt_tensor/monotonic_align.py,sha256=LhBd8p1xdBzg6jQrQX1j7b4PNeYGwIqM24zcU-pHOLE,2239
 lt_tensor/noise_tools.py,sha256=wFeAsHhLhSlEc5XU5LbFKaXoHeVxrWjiMeljjGdIKyM,11363
 lt_tensor/torch_commons.py,sha256=fntsEU8lhBQo0ebonI1iXBkMbWMN3HpBsG13EWlP5s8,718
@@ -18,15 +18,14 @@ lt_tensor/model_zoo/fusion.py,sha256=usC1bcjQRNivDc8xzkIS5T1glm78OLcs2V_tPqfp-eI
 lt_tensor/model_zoo/pos_encoder.py,sha256=3d1EYLinCU9UAy-WuEWeYMGhMqaGknCiQ5qEmhw_UYM,4487
 lt_tensor/model_zoo/residual.py,sha256=i5V4ju7DB3WesKBVm6KH_LyPoKGDUOyo2Usfs-PyP58,9394
 lt_tensor/model_zoo/transformer.py,sha256=HUFoFFh7EQJErxdd9XIxhssdjvNVx2tNGDJOTUfwG2A,4301
-lt_tensor/model_zoo/audio_models/__init__.py,sha256=
-lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=
-lt_tensor/model_zoo/audio_models/diffwave/model.py,sha256=kHo76bxLJtTBn1m0gq5KKrUsjm9ASsCCwf8MvWaB1R8,6901
+lt_tensor/model_zoo/audio_models/__init__.py,sha256=CmoakfBLoxqtJuYc1NYrB_z0x1kS2WQNaYQRmCaC5ko,40
+lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=8DbKJpQ44s9iPlajfs7_A2N1diYGXzkhet_wS4hX6mU,7421
 lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=BOBZSK2HFOdMcFyjrzwZi_TeAtBGIcpb8pQxiGlwLEE,12302
 lt_tensor/model_zoo/audio_models/istft/__init__.py,sha256=o7Ie1qI22u_g9t1252PX4vl4uF6JHynAJryuz2lAZE0,12920
 lt_tensor/processors/__init__.py,sha256=4b9MxAJolXiJfSm20ZEspQTDm1tgLazwlPWA_jB1yLM,63
 lt_tensor/processors/audio.py,sha256=SMqNSl4Den-x1awTCQ8-TcR-0jPiv5lDaUpU93SRRaw,14749
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
+lt_tensor-0.0.1a17.dist-info/licenses/LICENSE,sha256=HUnu_iSPpnDfZS_PINhO3AoVizJD1A2vee8WX7D7uXo,11358
+lt_tensor-0.0.1a17.dist-info/METADATA,sha256=nHIAMKShjCnhr2KdEiAhi8IIqP9PoTWnGHgsSfNcTDs,1033
+lt_tensor-0.0.1a17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lt_tensor-0.0.1a17.dist-info/top_level.txt,sha256=35FuhFeXnUyvHWdbVHGPh0hS8euofafnJ_GJAVSF4Kk,10
+lt_tensor-0.0.1a17.dist-info/RECORD,,
```
lt_tensor/model_zoo/audio_models/diffwave/model.py
DELETED
```diff
@@ -1,201 +0,0 @@
-__all__ = ["DiffWave", "SpectrogramUpsampler", "DiffusionEmbedding"]
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from math import sqrt
-
-
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-
-    def override(self, attrs):
-        if isinstance(attrs, dict):
-            self.__dict__.update(**attrs)
-        elif isinstance(attrs, (list, tuple, set)):
-            for attr in attrs:
-                self.override(attr)
-        elif attrs is not None:
-            raise NotImplementedError
-        return self
-
-
-params = AttrDict(
-    # Training params
-    batch_size=16,
-    learning_rate=2e-4,
-    max_grad_norm=None,
-    # Data params
-    sample_rate=22050,
-    n_mels=80,
-    n_fft=1024,
-    hop_samples=256,
-    crop_mel_frames=62,  # Probably an error in paper.
-    # Model params
-    residual_layers=30,
-    residual_channels=64,
-    dilation_cycle_length=10,
-    unconditional=False,
-    noise_schedule=np.linspace(1e-4, 0.05, 50).tolist(),
-    inference_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
-    # unconditional sample len
-    audio_len=22050 * 5,  # unconditional_synthesis_samples
-)
-
-
-def Conv1d(*args, **kwargs):
-    layer = nn.Conv1d(*args, **kwargs)
-    nn.init.kaiming_normal_(layer.weight)
-    return layer
-
-
-class DiffusionEmbedding(nn.Module):
-    def __init__(self, max_steps):
-        super().__init__()
-        self.register_buffer(
-            "embedding", self._build_embedding(max_steps), persistent=False
-        )
-        self.projection1 = nn.Linear(128, 512)
-        self.projection2 = nn.Linear(512, 512)
-        self.activation = nn.SiLU()
-
-    def forward(self, diffusion_step):
-        if diffusion_step.dtype in [torch.int32, torch.int64]:
-            x = self.embedding[diffusion_step]
-        else:
-            x = self._lerp_embedding(diffusion_step)
-        x = self.projection1(x)
-        x = self.activation(x)
-        x = self.projection2(x)
-        x = self.activation(x)
-        return x
-
-    def _lerp_embedding(self, t):
-        low_idx = torch.floor(t).long()
-        high_idx = torch.ceil(t).long()
-        low = self.embedding[low_idx]
-        high = self.embedding[high_idx]
-        return low + (high - low) * (t - low_idx)
-
-    def _build_embedding(self, max_steps):
-        steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
-        dims = torch.arange(64).unsqueeze(0)  # [1,64]
-        table = steps * 10.0 ** (dims * 4.0 / 63.0)  # [T,64]
-        table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
-        return table
-
-
-class SpectrogramUpsampler(nn.Module):
-    def __init__(self, n_mels):
-        super().__init__()
-        self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
-        self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
-
-    def forward(self, x):
-        x = torch.unsqueeze(x, 1)
-        x = self.conv1(x)
-        x = F.leaky_relu(x, 0.4)
-        x = self.conv2(x)
-        x = F.leaky_relu(x, 0.4)
-        x = torch.squeeze(x, 1)
-        return x
-
-
-class ResidualBlock(nn.Module):
-    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
-        """
-        :param n_mels: inplanes of conv1x1 for spectrogram conditional
-        :param residual_channels: audio conv
-        :param dilation: audio conv dilation
-        :param uncond: disable spectrogram conditional
-        """
-        super().__init__()
-        self.dilated_conv = Conv1d(
-            residual_channels,
-            2 * residual_channels,
-            3,
-            padding=dilation,
-            dilation=dilation,
-        )
-        self.diffusion_projection = nn.Linear(512, residual_channels)
-        if not uncond:  # conditional model
-            self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
-        else:  # unconditional model
-            self.conditioner_projection = None
-
-        self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
-
-    def forward(self, x, diffusion_step, conditioner=None):
-        assert (conditioner is None and self.conditioner_projection is None) or (
-            conditioner is not None and self.conditioner_projection is not None
-        )
-
-        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
-        y = x + diffusion_step
-        if self.conditioner_projection is None:  # using a unconditional model
-            y = self.dilated_conv(y)
-        else:
-            conditioner = self.conditioner_projection(conditioner)
-            y = self.dilated_conv(y) + conditioner
-
-        gate, filter = torch.chunk(y, 2, dim=1)
-        y = torch.sigmoid(gate) * torch.tanh(filter)
-
-        y = self.output_projection(y)
-        residual, skip = torch.chunk(y, 2, dim=1)
-        return (x + residual) / sqrt(2.0), skip
-
-
-class DiffWave(nn.Module):
-    def __init__(self, params):
-        super().__init__()
-        self.params = params
-        self.input_projection = Conv1d(1, params.residual_channels, 1)
-        self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
-        if self.params.unconditional:  # use unconditional model
-            self.spectrogram_upsampler = None
-        else:
-            self.spectrogram_upsampler = SpectrogramUpsampler(params.n_mels)
-
-        self.residual_layers = nn.ModuleList(
-            [
-                ResidualBlock(
-                    params.n_mels,
-                    params.residual_channels,
-                    2 ** (i % params.dilation_cycle_length),
-                    uncond=params.unconditional,
-                )
-                for i in range(params.residual_layers)
-            ]
-        )
-        self.skip_projection = Conv1d(
-            params.residual_channels, params.residual_channels, 1
-        )
-        self.output_projection = Conv1d(params.residual_channels, 1, 1)
-        nn.init.zeros_(self.output_projection.weight)
-
-    def forward(self, audio, diffusion_step, spectrogram=None):
-        assert (spectrogram is None and self.spectrogram_upsampler is None) or (
-            spectrogram is not None and self.spectrogram_upsampler is not None
-        )
-        x = audio.unsqueeze(1)
-        x = self.input_projection(x)
-        x = F.relu(x)
-
-        diffusion_step = self.diffusion_embedding(diffusion_step)
-        if self.spectrogram_upsampler:  # use conditional model
-            spectrogram = self.spectrogram_upsampler(spectrogram)
-
-        skip = None
-        for layer in self.residual_layers:
-            x, skip_connection = layer(x, diffusion_step, spectrogram)
-            skip = skip_connection if skip is None else skip_connection + skip
-
-        x = skip / sqrt(len(self.residual_layers))
-        x = self.skip_projection(x)
-        x = F.relu(x)
-        x = self.output_projection(x)
-        return x
```
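The deleted module configured everything through a mutable module-level `AttrDict`; its replacement keeps the same fields as class attributes on `DiffWaveConfig` (see the `__init__.py` diff above). A standalone illustration of the two styles:

```python
# Old style: a dict whose keys double as attributes (as in the deleted model.py).
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self


old_params = AttrDict(sample_rate=22050, hop_samples=256)


# New style: defaults live on a config class; instances override selectively.
class NewStyleConfig:
    sample_rate = 24000  # note: the default moved from 22050 to 24000
    hop_samples = 256


print(old_params.sample_rate, NewStyleConfig.sample_rate)  # 22050 24000
```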
{lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/WHEEL
File without changes
{lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/licenses/LICENSE
File without changes
{lt_tensor-0.0.1a16.dist-info → lt_tensor-0.0.1a17.dist-info}/top_level.txt
File without changes