lt-tensor 0.0.1a16__py3-none-any.whl → 0.0.1a18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,16 +7,14 @@ from lt_tensor.misc_utils import updateDict
7
7
 
8
8
  class ModelConfig(ABC, OrderedDict):
9
9
  _default_settings: Dict[str, Any] = {}
10
- _forbidden_list: List[str] = [
11
- "_settings",
12
- ]
10
+ _forbidden_list: List[str] = ["_default_settings", "_forbidden_list" "path_name"]
13
11
 
14
12
  def __init__(
15
13
  self,
16
- settings: Dict[str, Any] = None,
14
+ settings: Dict[str, Any] = {},
17
15
  path_name: Optional[Union[str, PathLike]] = None,
18
16
  ):
19
- assert is_dict(settings)
17
+ assert is_dict(settings, False)
20
18
  self._default_settings = settings
21
19
  if path_name is not None and is_pathlike(path_name):
22
20
  if not str(path_name).endswith(".json"):
@@ -37,31 +35,41 @@ class ModelConfig(ABC, OrderedDict):
37
35
  self.path_name += ".json"
38
36
 
39
37
  def reset_settings(self):
38
+ dk_keys = self.__dict__.keys()
40
39
  for s_name, setting in self._default_settings.items():
41
- if s_name in self._forbidden_list:
40
+ if s_name in self._forbidden_list or s_name not in dk_keys:
42
41
  continue
43
42
  updateDict(self, {s_name: setting})
44
43
 
45
44
  def save_config(
46
45
  self,
47
- path_name: Union[PathLike, str],
46
+ path_name: Optional[Union[PathLike, str]] = None,
48
47
  ):
49
- assert is_pathlike(
50
- path_name, True
51
- ), f"path_name should be a non-empty string or pathlike object! received instead: {path_name}"
52
- self._setup_path_name(path_name)
53
- base = {k: y for k, y in self.__dict__.items() if k not in self._forbidden_list}
54
- save_json(self.path_name, base, indent=2)
48
+ if not is_pathlike(path_name, True):
49
+ assert (
50
+ path_name is None
51
+ ), f"path_name should be a non-empty string or pathlike object! received instead: {path_name}."
52
+ path_name = self.path_name
53
+ else:
54
+ self._setup_path_name(path_name)
55
55
 
56
- def to_dict(self):
57
- return {k: y for k, y in self.__dict__.items() if k not in self._forbidden_list}
56
+ base = self.get_state_dict()
57
+ save_json(self.path_name, base, indent=2)
58
58
 
59
59
  def set_value(self, var_name: str, value: str) -> None:
60
+ assert var_name in self.__dict__, "Value not registered!"
61
+ assert var_name not in self._forbidden_list, "Not allowed!"
60
62
  updateDict(self, {var_name: value})
61
63
 
62
64
  def get_value(self, var_name: str) -> Any:
63
65
  return self.__dict__.get(var_name)
64
66
 
67
+ def __getattribute__(self, name):
68
+ return self.__dict__.get(name)
69
+
70
+ def get_state_dict(self):
71
+ return {k: y for k, y in self.__dict__.items() if k not in self._forbidden_list}
72
+
65
73
  @classmethod
66
74
  def from_dict(
67
75
  cls, dictionary: Dict[str, Any], path: Optional[Union[str, PathLike]] = None
lt_tensor/model_base.py CHANGED
@@ -70,16 +70,6 @@ class LossTracker:
70
70
 
71
71
  class _Devices_Base(nn.Module):
72
72
  _device: torch.device = ROOT_DEVICE
73
- _autocast: bool = False
74
- _loss_history: LossTracker = LossTracker(100_000)
75
-
76
- @property
77
- def autocast(self):
78
- return self._autocast
79
-
80
- @autocast.setter
81
- def autocast(self, value: bool):
82
- self._autocast = value
83
73
 
84
74
  @property
85
75
  def device(self):
@@ -90,6 +80,30 @@ class _Devices_Base(nn.Module):
90
80
  assert isinstance(device, (str, torch.device))
91
81
  self._device = torch.device(device) if isinstance(device, str) else device
92
82
 
83
+ def _apply_device(self):
84
+ """Add here components that are needed to have device applied to them,
85
+ that usually the '.to()' function fails to apply
86
+
87
+ example:
88
+ ```
89
+ def _apply_device_to(self):
90
+ self.my_tensor = self.my_tensor.to(device=self.device)
91
+ ```
92
+ """
93
+ pass
94
+
95
+ def _to_dvc(
96
+ self, device_name: str, device_id: Optional[Union[int, torch.device]] = None
97
+ ):
98
+ device = device_name
99
+ if device_id is not None:
100
+ if isinstance(device_id, Number):
101
+ device += ":" + str(int(device_id))
102
+ elif hasattr(device_id, "index"):
103
+ device += ":" + str(device_id.index)
104
+ self.device = device
105
+ self._apply_device()
106
+
93
107
  def to(self, *args, **kwargs):
94
108
  device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(
95
109
  *args, **kwargs
@@ -133,20 +147,9 @@ class _Devices_Base(nn.Module):
133
147
  raise
134
148
 
135
149
  self._apply(convert)
136
- self.device = device
150
+ self._to_dvc(device)
137
151
  return self
138
152
 
139
- def _to_dvc(
140
- self, device_name: str, device_id: Optional[Union[int, torch.device]] = None
141
- ):
142
- device = device_name
143
- if device_id is not None:
144
- if isinstance(device_id, Number):
145
- device += ":" + str(int(device_id))
146
- elif hasattr(device_id, "index"):
147
- device += ":" + str(device_id.index)
148
- self.device = device
149
-
150
153
  def ipu(self, device: Optional[Union[int, torch.device]] = None) -> T:
151
154
  super().ipu(device)
152
155
  self._to_dvc("ipu", device)
@@ -178,11 +181,12 @@ class Model(_Devices_Base, ABC):
178
181
  This makes it easier to assign a device and retrieves it later
179
182
  """
180
183
 
184
+ _autocast: bool = False
181
185
  _is_unfrozen: bool = False
182
186
  # list with modules that can be frozen or unfrozen
183
187
  registered_freezable_modules: List[str] = []
184
188
  is_frozen: bool = False
185
- _is_gradient_freezable: bool = (
189
+ _can_be_frozen: bool = (
186
190
  False # to control if the module can or cannot be freezed by other modules from 'Model' class
187
191
  )
188
192
  # this is to be used on the case of they module requires low-rank adapters
@@ -193,18 +197,15 @@ class Model(_Devices_Base, ABC):
193
197
 
194
198
  # dont save list:
195
199
  _dont_save_items: List[str] = []
200
+ _loss_history: LossTracker = LossTracker(20_000)
196
201
 
197
- def _apply_device_to(self):
198
- """Add here components that are needed to have device applied to them,
199
- that usually the '.to()' function fails to apply
202
+ @property
203
+ def autocast(self):
204
+ return self._autocast
200
205
 
201
- example:
202
- ```
203
- def _apply_device_to(self):
204
- self.my_tensor = self.my_tensor.to(device=self.device)
205
- ```
206
- """
207
- pass
206
+ @autocast.setter
207
+ def autocast(self, value: bool):
208
+ self._autocast = value
208
209
 
209
210
  def freeze_all(self, exclude: Optional[List[str]] = None):
210
211
  no_exclusions = not exclude
@@ -251,7 +252,7 @@ class Model(_Devices_Base, ABC):
251
252
  def change_frozen_state(self, freeze: bool, module: nn.Module):
252
253
  try:
253
254
  if isinstance(module, Model):
254
- if module._is_gradient_freezable:
255
+ if module._can_be_frozen:
255
256
  if freeze:
256
257
  return module.freeze_all()
257
258
  return module.unfreeze_all()
@@ -496,10 +497,7 @@ class Model(_Devices_Base, ABC):
496
497
  return self(*inputs, **kwargs)
497
498
 
498
499
  def __call__(self, *args, **kwds) -> POSSIBLE_OUTPUT_TYPES:
499
- if self.autocast and not self.training:
500
- with torch.autocast(device_type=self.device.type):
501
- return super().__call__(*args, **kwds)
502
- else:
500
+ with torch.autocast(device_type=self.device.type, enabled=self.autocast):
503
501
  return super().__call__(*args, **kwds)
504
502
 
505
503
  @abstractmethod
@@ -541,52 +539,3 @@ class Model(_Devices_Base, ABC):
541
539
  if quantity > 0:
542
540
  t_list = t_list[-quantity:]
543
541
  return sum(t_list) / len(t_list)
544
-
545
- def freeze_unfreeze_loss(
546
- self,
547
- losses: Optional[Union[float, List[float]]] = None,
548
- trigger_loss: Union[float, bool] = 0.1,
549
- excluded_modules: Optional[List[str]] = None,
550
- max_items: int = 1000,
551
- loss_name: str = "train",
552
- ):
553
- """If a certain threshold is reached the weights will freeze or unfreeze the modules.
554
- the biggest use-case for this function is when training GANs where the balance
555
- from the discriminator and generator must be kept.
556
-
557
- Args:
558
- losses (Union[float, List[float]], Optional): The loss value or a list of losses that will be used to determine if it has reached or not the threshold. Defaults to None.
559
- trigger_loss (float, bool, optional): The value where the weights will be either freeze or unfreeze. If set to a boolean it will freeze or unfreeze immediately according to the value (True = Freeze, False = Unfreeze). Defaults to 0.1.
560
- excluded_modules (list[str], optional): The list of modules (names) that is not to be changed by either freezing nor unfreezing. Defaults to None.
561
- max_items (float, optional): The number of previous losses to be locked behind to calculate the current average. Default to 1000.
562
- loss_name (str, optional): Responsible to define with key to recover the loss.
563
- returns:
564
- bool: True when its frozen and false when its trainable.
565
- """
566
- if losses is not None:
567
- self.add_loss(losses, "train")
568
-
569
- if isinstance(trigger_loss, bool):
570
- if trigger_loss:
571
- if self._is_unfrozen:
572
- self.freeze_all(excluded_modules)
573
- self._is_unfrozen = False
574
- return True
575
- # else
576
- if not self._is_unfrozen:
577
- self.unfreeze_all(excluded_modules)
578
- self._is_unfrozen = True
579
- return False
580
-
581
- value = self.get_loss_avg(loss_name, max_items)
582
-
583
- if value <= trigger_loss:
584
- if self._is_unfrozen:
585
- self.freeze_all(excluded_modules)
586
- self._is_unfrozen = False
587
- return True
588
- else:
589
- if not self._is_unfrozen:
590
- self.unfreeze_all(excluded_modules)
591
- self._is_unfrozen = True
592
- return False
@@ -1 +1,3 @@
1
1
  from . import diffwave, istft, hifigan
2
+
3
+ __all__ = ["diffwave", "istft", "hifigan"]
@@ -1,3 +1,216 @@
1
- __all__ = ["DiffWave", "SpectrogramUpsampler", "DiffusionEmbedding"]
1
+ __all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsample", "DiffusionEmbedding"]
2
2
 
3
- from .model import DiffWave, SpectrogramUpsampler, DiffusionEmbedding
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from lt_tensor.config_templates import ModelConfig
8
+ from lt_tensor.torch_commons import *
9
+ from lt_tensor.model_base import Model
10
+ from math import sqrt
11
+ from lt_utils.common import *
12
+
13
+
14
+ class DiffWaveConfig(ModelConfig):
15
+ # Training params
16
+ batch_size = 16
17
+ learning_rate = 2e-4
18
+ max_grad_norm = None
19
+ # Data params
20
+ sample_rate = 24000
21
+ n_mels = 80
22
+ n_fft = 1024
23
+ hop_samples = 256
24
+ # Model params
25
+ residual_layers = 30
26
+ residual_channels = 64
27
+ dilation_cycle_length = 10
28
+ unconditional = False
29
+ noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist()
30
+ # settings for auto-fixes
31
+ interpolate = False
32
+ interpolation_mode: Literal[
33
+ "nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"
34
+ ] = "nearest"
35
+
36
+ def __init__(
37
+ self,
38
+ settings: Dict[str, Any] = {},
39
+ path_name: Optional[Union[str, PathLike]] = None,
40
+ ):
41
+ super().__init__(settings, path_name)
42
+
43
+
44
+ def Conv1d(*args, **kwargs):
45
+ layer = nn.Conv1d(*args, **kwargs)
46
+ nn.init.kaiming_normal_(layer.weight)
47
+ return layer
48
+
49
+
50
+ class DiffusionEmbedding(Model):
51
+ def __init__(self, max_steps: int):
52
+ super().__init__()
53
+ self.register_buffer(
54
+ "embedding", self._build_embedding(max_steps), persistent=False
55
+ )
56
+ self.projection1 = nn.Linear(128, 512)
57
+ self.projection2 = nn.Linear(512, 512)
58
+ self.activation = nn.SiLU()
59
+
60
+ def forward(self, diffusion_step):
61
+ if diffusion_step.dtype in [torch.int32, torch.int64]:
62
+ x = self.embedding[diffusion_step]
63
+ else:
64
+ x = self._lerp_embedding(diffusion_step)
65
+ x = self.projection1(x)
66
+ x = self.activation(x)
67
+ x = self.projection2(x)
68
+ x = self.activation(x)
69
+ return x
70
+
71
+ def _lerp_embedding(self, t):
72
+ low_idx = torch.floor(t).long()
73
+ high_idx = torch.ceil(t).long()
74
+ low = self.embedding[low_idx]
75
+ high = self.embedding[high_idx]
76
+ return low + (high - low) * (t - low_idx)
77
+
78
+ def _build_embedding(self, max_steps):
79
+ steps = torch.arange(max_steps).unsqueeze(1) # [T,1]
80
+ dims = torch.arange(64).unsqueeze(0) # [1,64]
81
+ table = steps * 10.0 ** (dims * 4.0 / 63.0) # [T,64]
82
+ table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
83
+ return table
84
+
85
+
86
+ class SpectrogramUpsample(Model):
87
+ def __init__(self):
88
+ super().__init__()
89
+ self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
90
+ self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
91
+ self.activation = nn.LeakyReLU(0.4)
92
+
93
+ def forward(self, x):
94
+ x = torch.unsqueeze(x, 1)
95
+ x = self.activation(self.conv1(x))
96
+ x = self.activation(self.conv2(x))
97
+ x = torch.squeeze(x, 1)
98
+ return x
99
+
100
+
101
+ class ResidualBlock(Model):
102
+ def __init__(self, n_mels, residual_channels, dilation, uncond=False):
103
+ """
104
+ :param n_mels: inplanes of conv1x1 for spectrogram conditional
105
+ :param residual_channels: audio conv
106
+ :param dilation: audio conv dilation
107
+ :param uncond: disable spectrogram conditional
108
+ """
109
+ super().__init__()
110
+ self.dilated_conv = Conv1d(
111
+ residual_channels,
112
+ 2 * residual_channels,
113
+ 3,
114
+ padding=dilation,
115
+ dilation=dilation,
116
+ )
117
+ self.diffusion_projection = nn.Linear(512, residual_channels)
118
+ if not uncond: # conditional model
119
+ self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
120
+ else: # unconditional model
121
+ self.conditioner_projection = None
122
+
123
+ self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
124
+
125
+ def forward(
126
+ self,
127
+ x: Tensor,
128
+ diffusion_step: Tensor,
129
+ conditioner: Optional[Tensor] = None,
130
+ ):
131
+
132
+ diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
133
+ y = x + diffusion_step
134
+ if (
135
+ conditioner is None or self.conditioner_projection is None
136
+ ): # using a unconditional model
137
+ y = self.dilated_conv(y)
138
+ else:
139
+ conditioner = self.conditioner_projection(conditioner)
140
+ y = self.dilated_conv(y) + conditioner
141
+
142
+ gate, filter = torch.chunk(y, 2, dim=1)
143
+ y = torch.sigmoid(gate) * torch.tanh(filter)
144
+
145
+ y = self.output_projection(y)
146
+ residual, skip = torch.chunk(y, 2, dim=1)
147
+ return (x + residual) / sqrt(2.0), skip
148
+
149
+
150
+ class DiffWave(Model):
151
+ def __init__(self, params: DiffWaveConfig = DiffWaveConfig()):
152
+ super().__init__()
153
+ self.params = params
154
+ self.n_hop = self.params.hop_samples
155
+ self.interpolate = self.params.interpolate
156
+ self.interpolate_mode = self.params.interpolation_mode
157
+ self.input_projection = Conv1d(1, params.residual_channels, 1)
158
+ self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
159
+ if self.params.unconditional: # use unconditional model
160
+ self.spectrogram_upsample = None
161
+ else:
162
+ self.spectrogram_upsample = SpectrogramUpsample()
163
+
164
+ self.residual_layers = nn.ModuleList(
165
+ [
166
+ ResidualBlock(
167
+ params.n_mels,
168
+ params.residual_channels,
169
+ 2 ** (i % params.dilation_cycle_length),
170
+ uncond=params.unconditional,
171
+ )
172
+ for i in range(params.residual_layers)
173
+ ]
174
+ )
175
+ self.skip_projection = Conv1d(
176
+ params.residual_channels, params.residual_channels, 1
177
+ )
178
+ self.output_projection = Conv1d(params.residual_channels, 1, 1)
179
+ self.activation = nn.LeakyReLU(0.1)
180
+ self.r_sqrt = sqrt(len(self.residual_layers))
181
+ nn.init.zeros_(self.output_projection.weight)
182
+
183
+ def forward(
184
+ self,
185
+ audio: Tensor,
186
+ diffusion_step: Tensor,
187
+ spectrogram: Optional[Tensor] = None,
188
+ ):
189
+ T = x.shape[-1]
190
+ if x.ndim == 2:
191
+ x = audio.unsqueeze(1)
192
+ x = self.activation(self.input_projection(x))
193
+
194
+ diffusion_step = self.diffusion_embedding(diffusion_step)
195
+ if spectrogram is not None and self.spectrogram_upsample is not None:
196
+ if self.auto_interpolate:
197
+ # a little heavy, but helps a lot to fix mismatched shapes,
198
+ # not always recommended due to data loss
199
+ spectrogram = F.interpolate(
200
+ input=spectrogram,
201
+ size=int(T * self.n_hop),
202
+ mode=self.interpolate_mode,
203
+ )
204
+ spectrogram = self.spectrogram_upsample(spectrogram)
205
+
206
+ skip = None
207
+ for i, layer in enumerate(self.residual_layers):
208
+ x, skip_connection = layer(x, diffusion_step, spectrogram)
209
+ if i == 0:
210
+ skip = skip_connection
211
+ else:
212
+ skip = skip_connection + skip
213
+ x = skip / self.r_sqrt
214
+ x = self.activation(self.skip_projection(x))
215
+ x = self.output_projection(x)
216
+ return x
@@ -1,4 +1,4 @@
1
- __all__ = ["HifiganGenerator"]
1
+ __all__ = ["HifiganGenerator", "HifiganConfig"]
2
2
  from lt_utils.common import *
3
3
  from lt_tensor.torch_commons import *
4
4
  from lt_tensor.model_zoo.residual import ConvNets
@@ -13,6 +13,33 @@ def get_padding(kernel_size, dilation=1):
13
13
  return int((kernel_size * dilation - dilation) / 2)
14
14
 
15
15
 
16
+ from lt_tensor.config_templates import ModelConfig
17
+
18
+
19
+ class HifiganConfig(ModelConfig):
20
+ # Training params
21
+ in_channels: int = 80
22
+ upsample_rates: List[Union[int, List[int]]] = [8, 8]
23
+ upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16]
24
+ upsample_initial_channel: int = (512,)
25
+ resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
26
+ resblock_dilation_sizes: List[Union[int, List[int]]] = [
27
+ [1, 3, 5],
28
+ [1, 3, 5],
29
+ [1, 3, 5],
30
+ ]
31
+
32
+ activation: nn.Module = nn.LeakyReLU(0.1)
33
+ resblock: int = 0
34
+
35
+ def __init__(
36
+ self,
37
+ settings: Dict[str, Any] = {},
38
+ path_name: Optional[Union[str, PathLike]] = None,
39
+ ):
40
+ super().__init__(settings, path_name)
41
+
42
+
16
43
  class ResBlock1(ConvNets):
17
44
  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
18
45
  super().__init__()
@@ -142,23 +169,23 @@ class ResBlock2(ConvNets):
142
169
 
143
170
 
144
171
  class HifiganGenerator(ConvNets):
145
- def __init__(self, h):
172
+ def __init__(self, cfg: HifiganConfig = HifiganConfig()):
146
173
  super().__init__()
147
- self.h = h
148
- self.num_kernels = len(h.resblock_kernel_sizes)
149
- self.num_upsamples = len(h.upsample_rates)
174
+ self.cfg = cfg
175
+ self.num_kernels = len(cfg.resblock_kernel_sizes)
176
+ self.num_upsamples = len(cfg.upsample_rates)
150
177
  self.conv_pre = weight_norm(
151
- nn.Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
178
+ nn.Conv1d(cfg.in_channels, cfg.upsample_initial_channel, 7, 1, padding=3)
152
179
  )
153
- resblock = ResBlock1 if h.resblock == "1" else ResBlock2
154
- self.activation = nn.LeakyReLU(0.1)
180
+ resblock = ResBlock1 if resblock == 0 else ResBlock2
181
+ self.activation = cfg.activation
155
182
  self.ups = nn.ModuleList()
156
- for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
183
+ for i, (u, k) in enumerate(zip(cfg.psample_rates, cfg.upsample_kernel_sizes)):
157
184
  self.ups.append(
158
185
  weight_norm(
159
186
  nn.ConvTranspose1d(
160
- h.upsample_initial_channel // (2**i),
161
- h.upsample_initial_channel // (2 ** (i + 1)),
187
+ cfg.upsample_initial_channel // (2**i),
188
+ cfg.upsample_initial_channel // (2 ** (i + 1)),
162
189
  k,
163
190
  u,
164
191
  padding=(k - u) // 2,
@@ -168,17 +195,17 @@ class HifiganGenerator(ConvNets):
168
195
 
169
196
  self.resblocks = nn.ModuleList()
170
197
  for i in range(len(self.ups)):
171
- ch = h.upsample_initial_channel // (2 ** (i + 1))
198
+ ch = cfg.upsample_initial_channel // (2 ** (i + 1))
172
199
  for j, (k, d) in enumerate(
173
- zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
200
+ zip(cfg.resblock_kernel_sizes, cfg.resblock_dilation_sizes)
174
201
  ):
175
- self.resblocks.append(resblock(h, ch, k, d))
202
+ self.resblocks.append(resblock(ch, k, d))
176
203
 
177
204
  self.conv_post = weight_norm(nn.Conv1d(ch, 1, 7, 1, padding=3))
178
205
  self.ups.apply(self.init_weights)
179
206
  self.conv_post.apply(self.init_weights)
180
207
 
181
- def forward(self, x):
208
+ def forward(self, x: Tensor):
182
209
  x = self.conv_pre(x)
183
210
  for i in range(self.num_upsamples):
184
211
  x = self.ups[i](self.activation(x))
@@ -1,8 +1,35 @@
1
- __all__ = ["iSTFTGenerator"]
1
+ __all__ = ["iSTFTNetGenerator", "iSTFTNetConfig"]
2
2
  from lt_utils.common import *
3
3
  from lt_tensor.torch_commons import *
4
4
  from lt_tensor.model_zoo.residual import ConvNets
5
5
  from torch.nn import functional as F
6
+ from lt_tensor.config_templates import ModelConfig
7
+
8
+
9
+ class iSTFTNetConfig(ModelConfig):
10
+ # Training params
11
+ in_channels: int = 80
12
+ upsample_rates: List[Union[int, List[int]]] = [8, 8]
13
+ upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16]
14
+ upsample_initial_channel: int = (512,)
15
+ resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
16
+ resblock_dilation_sizes: List[Union[int, List[int]]] = [
17
+ [1, 3, 5],
18
+ [1, 3, 5],
19
+ [1, 3, 5],
20
+ ]
21
+
22
+ activation: nn.Module = nn.LeakyReLU(0.1)
23
+ resblock: int = 0
24
+ gen_istft_n_fft: int = 16
25
+ sampling_rate: Number = 24000
26
+
27
+ def __init__(
28
+ self,
29
+ settings: Dict[str, Any] = {},
30
+ path_name: Optional[Union[str, PathLike]] = None,
31
+ ):
32
+ super().__init__(settings, path_name)
6
33
 
7
34
 
8
35
  def get_padding(ks, d):
@@ -10,9 +37,8 @@ def get_padding(ks, d):
10
37
 
11
38
 
12
39
  class ResBlock1(ConvNets):
13
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
40
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
14
41
  super().__init__()
15
- self.h = h
16
42
  self.convs1 = nn.ModuleList(
17
43
  [
18
44
  weight_norm(
@@ -95,10 +121,10 @@ class ResBlock1(ConvNets):
95
121
  x = xt + x
96
122
  return x
97
123
 
124
+
98
125
  class ResBlock2(ConvNets):
99
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
126
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
100
127
  super().__init__()
101
- self.h = h
102
128
  self.convs = nn.ModuleList(
103
129
  [
104
130
  weight_norm(
@@ -134,25 +160,25 @@ class ResBlock2(ConvNets):
134
160
  return x
135
161
 
136
162
 
137
- class iSTFTGenerator(ConvNets):
138
- def __init__(self, h):
163
+ class iSTFTNetGenerator(ConvNets):
164
+ def __init__(self, cfg: iSTFTNetConfig = iSTFTNetConfig()):
139
165
  super().__init__()
140
- self.h = h
141
- self.num_kernels = len(h.resblock_kernel_sizes)
142
- self.num_upsamples = len(h.upsample_rates)
166
+ self.cfg = cfg
167
+ self.num_kernels = len(cfg.resblock_kernel_sizes)
168
+ self.num_upsamples = len(cfg.upsample_rates)
143
169
  self.conv_pre = weight_norm(
144
- nn.Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
170
+ nn.Conv1d(cfg.in_channels, cfg.upsample_initial_channel, 7, 1, padding=3)
145
171
  )
146
- resblock = ResBlock1 if h.resblock == "1" else ResBlock2
172
+ resblock = ResBlock1 if resblock == 0 else ResBlock2
147
173
 
148
174
  self.ups = nn.ModuleList()
149
- for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
150
- if h.sampling_rate % 16000:
175
+ for i, (u, k) in enumerate(zip(cfg.upsample_rates, cfg.upsample_kernel_sizes)):
176
+ if cfg.sampling_rate % 16000:
151
177
  self.ups.append(
152
178
  weight_norm(
153
179
  nn.ConvTranspose1d(
154
- h.upsample_initial_channel // (2**i),
155
- h.upsample_initial_channel // (2 ** (i + 1)),
180
+ cfg.upsample_initial_channel // (2**i),
181
+ cfg.upsample_initial_channel // (2 ** (i + 1)),
156
182
  k,
157
183
  u,
158
184
  padding=(k - u) // 2,
@@ -163,8 +189,8 @@ class iSTFTGenerator(ConvNets):
163
189
  self.ups.append(
164
190
  weight_norm(
165
191
  nn.ConvTranspose1d(
166
- h.upsample_initial_channel // (2**i),
167
- h.upsample_initial_channel // (2 ** (i + 1)),
192
+ cfg.upsample_initial_channel // (2**i),
193
+ cfg.upsample_initial_channel // (2 ** (i + 1)),
168
194
  k,
169
195
  u,
170
196
  padding=(u // 2 + u % 2),
@@ -175,19 +201,19 @@ class iSTFTGenerator(ConvNets):
175
201
 
176
202
  self.resblocks = nn.ModuleList()
177
203
  for i in range(len(self.ups)):
178
- ch = h.upsample_initial_channel // (2 ** (i + 1))
204
+ ch = cfg.upsample_initial_channel // (2 ** (i + 1))
179
205
  for j, (k, d) in enumerate(
180
- zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
206
+ zip(cfg.resblock_kernel_sizes, cfg.resblock_dilation_sizes)
181
207
  ):
182
- self.resblocks.append(resblock(h, ch, k, d))
208
+ self.resblocks.append(resblock(ch, k, d))
183
209
 
184
- self.post_n_fft = h.gen_istft_n_fft
210
+ self.post_n_fft = cfg.gen_istft_n_fft
185
211
  self.conv_post = weight_norm(
186
212
  nn.Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3)
187
213
  )
188
214
  self.ups.apply(self.init_weights)
189
215
  self.conv_post.apply(self.init_weights)
190
- self.activation = nn.LeakyReLU(0.1)
216
+ self.activation = cfg.activation
191
217
  self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
192
218
 
193
219
  def forward(self, x):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lt-tensor
3
- Version: 0.0.1a16
3
+ Version: 0.0.1a18
4
4
  Summary: General utilities for PyTorch and others. Built for general use.
5
5
  Home-page: https://github.com/gr1336/lt-tensor/
6
6
  Author: gr1336
@@ -1,10 +1,10 @@
1
1
  lt_tensor/__init__.py,sha256=XxNCGcVL-haJyMpifr-GRaamo32R6jmqe3iOuS4ecfs,469
2
- lt_tensor/config_templates.py,sha256=FRN4-i1amoqMh_wyp4gNsw61ABWTIhGC62Uc3l3SNss,3515
2
+ lt_tensor/config_templates.py,sha256=xWZhktYVlkwvJVreqyACpWo-lJ5htG9vTZyqZ6OexzA,3899
3
3
  lt_tensor/losses.py,sha256=zvkCOnE5XpF3v6ymivRIdqPTsMM5zc94ZMom7YDi3zM,4946
4
4
  lt_tensor/lr_schedulers.py,sha256=LSZzqrOOLzSthD8k-W4cYPJt0vCjmHkiJkLr5e3yRTE,3659
5
5
  lt_tensor/math_ops.py,sha256=TkD4WQG42KsQ9Fg7FXOjf8f-ixtW0apf2XjaooecVx4,2257
6
6
  lt_tensor/misc_utils.py,sha256=S57M5XuGsIuaOKnEGZJsY3B2dTmggpdhsqQr51CQsYo,28754
7
- lt_tensor/model_base.py,sha256=qqqIVpYz6nv01MnZuuAj1dxq4_NN-zSivP1GaegA9TI,21597
7
+ lt_tensor/model_base.py,sha256=J-f-iQ9qGyYD4NkLljyAEkwtHKKbUKIrIpunMiSmh90,19155
8
8
  lt_tensor/monotonic_align.py,sha256=LhBd8p1xdBzg6jQrQX1j7b4PNeYGwIqM24zcU-pHOLE,2239
9
9
  lt_tensor/noise_tools.py,sha256=wFeAsHhLhSlEc5XU5LbFKaXoHeVxrWjiMeljjGdIKyM,11363
10
10
  lt_tensor/torch_commons.py,sha256=fntsEU8lhBQo0ebonI1iXBkMbWMN3HpBsG13EWlP5s8,718
@@ -18,15 +18,14 @@ lt_tensor/model_zoo/fusion.py,sha256=usC1bcjQRNivDc8xzkIS5T1glm78OLcs2V_tPqfp-eI
18
18
  lt_tensor/model_zoo/pos_encoder.py,sha256=3d1EYLinCU9UAy-WuEWeYMGhMqaGknCiQ5qEmhw_UYM,4487
19
19
  lt_tensor/model_zoo/residual.py,sha256=i5V4ju7DB3WesKBVm6KH_LyPoKGDUOyo2Usfs-PyP58,9394
20
20
  lt_tensor/model_zoo/transformer.py,sha256=HUFoFFh7EQJErxdd9XIxhssdjvNVx2tNGDJOTUfwG2A,4301
21
- lt_tensor/model_zoo/audio_models/__init__.py,sha256=CLoLqvbA_ltqm3KOg5AH3A0co0HtsLfFPUBsxxLSCgI,39
22
- lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=aFSmr8PYpmOfbe15lhNoj-ZzP5ChrZcikovKLZKg7nw,140
23
- lt_tensor/model_zoo/audio_models/diffwave/model.py,sha256=kHo76bxLJtTBn1m0gq5KKrUsjm9ASsCCwf8MvWaB1R8,6901
24
- lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=BOBZSK2HFOdMcFyjrzwZi_TeAtBGIcpb8pQxiGlwLEE,12302
25
- lt_tensor/model_zoo/audio_models/istft/__init__.py,sha256=o7Ie1qI22u_g9t1252PX4vl4uF6JHynAJryuz2lAZE0,12920
21
+ lt_tensor/model_zoo/audio_models/__init__.py,sha256=MoG9YjxLyvscq_6njK1ljGBletK9iedBXt66bplzW-s,83
22
+ lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=R14hY-nCbCO-T3ox9f4MXCPgQQogFUKAJ2WtntLz09w,7393
23
+ lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=6ZGYyNiTMGHnOjGU0gq_TSM8Y9LtYlP3neGwa01Ghyk,13135
24
+ lt_tensor/model_zoo/audio_models/istft/__init__.py,sha256=noi4GLGZQ_qg5H-ipe5d7j8rvt4Hic_sXiME-TE-B2c,13783
26
25
  lt_tensor/processors/__init__.py,sha256=4b9MxAJolXiJfSm20ZEspQTDm1tgLazwlPWA_jB1yLM,63
27
26
  lt_tensor/processors/audio.py,sha256=SMqNSl4Den-x1awTCQ8-TcR-0jPiv5lDaUpU93SRRaw,14749
28
- lt_tensor-0.0.1a16.dist-info/licenses/LICENSE,sha256=HUnu_iSPpnDfZS_PINhO3AoVizJD1A2vee8WX7D7uXo,11358
29
- lt_tensor-0.0.1a16.dist-info/METADATA,sha256=uxk1cMeQkLniYUIgEjHD2eJ8_JGwAKS2minrCmAJfMo,1033
30
- lt_tensor-0.0.1a16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
- lt_tensor-0.0.1a16.dist-info/top_level.txt,sha256=35FuhFeXnUyvHWdbVHGPh0hS8euofafnJ_GJAVSF4Kk,10
32
- lt_tensor-0.0.1a16.dist-info/RECORD,,
27
+ lt_tensor-0.0.1a18.dist-info/licenses/LICENSE,sha256=HUnu_iSPpnDfZS_PINhO3AoVizJD1A2vee8WX7D7uXo,11358
28
+ lt_tensor-0.0.1a18.dist-info/METADATA,sha256=fgRzOiw5tMmkaEY9HrGEKNL2v9mN5JVbf9r-bf18Am0,1033
29
+ lt_tensor-0.0.1a18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
30
+ lt_tensor-0.0.1a18.dist-info/top_level.txt,sha256=35FuhFeXnUyvHWdbVHGPh0hS8euofafnJ_GJAVSF4Kk,10
31
+ lt_tensor-0.0.1a18.dist-info/RECORD,,
@@ -1,201 +0,0 @@
1
- __all__ = ["DiffWave", "SpectrogramUpsampler", "DiffusionEmbedding"]
2
- import numpy as np
3
- import torch
4
- import torch.nn as nn
5
- import torch.nn.functional as F
6
-
7
- from math import sqrt
8
-
9
-
10
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance is installed as its own ``__dict__``, so ``d.key`` and
    ``d["key"]`` always refer to the same storage.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the mapping itself.
        self.__dict__ = self

    def override(self, attrs):
        """Merge ``attrs`` into this mapping in place and return ``self``.

        Args:
            attrs: A ``dict`` to merge, a ``list``/``tuple``/``set`` of such
                values (applied recursively, in iteration order), or
                ``None`` for a no-op.

        Returns:
            This instance, so calls can be chained.

        Raises:
            NotImplementedError: If ``attrs`` is of any other type.
        """
        if isinstance(attrs, dict):
            # Plain ``update`` instead of ``update(**attrs)``: keyword
            # expansion raises TypeError as soon as a key is not a string.
            self.update(attrs)
        elif isinstance(attrs, (list, tuple, set)):
            for attr in attrs:
                self.override(attr)
        elif attrs is not None:
            raise NotImplementedError(
                f"cannot override from {type(attrs).__name__}"
            )
        return self
24
-
25
-
26
# Default hyper-parameter set, exposed as an attribute-style dictionary.
params = AttrDict(
    {
        # Training params
        "batch_size": 16,
        "learning_rate": 2e-4,
        "max_grad_norm": None,
        # Data params
        "sample_rate": 22050,
        "n_mels": 80,
        "n_fft": 1024,
        "hop_samples": 256,
        "crop_mel_frames": 62,  # Probably an error in paper.
        # Model params
        "residual_layers": 30,
        "residual_channels": 64,
        "dilation_cycle_length": 10,
        "unconditional": False,
        # 50 evenly spaced values from 1e-4 to 0.05, stored as a plain list.
        "noise_schedule": np.linspace(1e-4, 0.05, 50).tolist(),
        "inference_noise_schedule": [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
        # unconditional sample len
        "audio_len": 22050 * 5,  # unconditional_synthesis_samples
    }
)
47
-
48
-
49
def Conv1d(*args, **kwargs):
    """Create an ``nn.Conv1d`` whose weight is re-initialised Kaiming-normal.

    Every positional and keyword argument is forwarded to ``nn.Conv1d``
    unchanged. Only the weight tensor is re-initialised here.
    """
    conv = nn.Conv1d(*args, **kwargs)
    nn.init.kaiming_normal_(conv.weight)
    return conv
53
-
54
-
55
class DiffusionEmbedding(nn.Module):
    """Map diffusion steps to 512-dimensional conditioning vectors.

    A fixed sinusoidal table of shape ``[max_steps, 128]`` is indexed (for
    integer steps) or linearly interpolated (for any other dtype), then
    passed through two linear layers, each followed by SiLU.
    """

    def __init__(self, max_steps):
        """
        :param max_steps: number of rows in the sinusoidal lookup table
        """
        super().__init__()
        # The table is deterministic, so it is registered as a
        # non-persistent buffer rather than a parameter.
        self.register_buffer(
            "embedding", self._build_embedding(max_steps), persistent=False
        )
        self.projection1 = nn.Linear(128, 512)
        self.projection2 = nn.Linear(512, 512)
        self.activation = nn.SiLU()

    def forward(self, diffusion_step):
        """Return the projected embedding for ``diffusion_step``.

        :param diffusion_step: tensor of step values; ``int32``/``int64``
            tensors index the table directly, any other dtype is
            interpolated between the two neighbouring rows
        :return: tensor of shape ``diffusion_step.shape + (512,)``
        """
        if diffusion_step.dtype in (torch.int32, torch.int64):
            x = self.embedding[diffusion_step]
        else:
            x = self._lerp_embedding(diffusion_step)
        x = self.projection1(x)
        x = self.activation(x)
        x = self.projection2(x)
        x = self.activation(x)
        return x

    def _lerp_embedding(self, t):
        """Linearly interpolate table rows at fractional positions ``t``."""
        low_idx = torch.floor(t).long()
        high_idx = torch.ceil(t).long()
        low = self.embedding[low_idx]
        high = self.embedding[high_idx]
        # ``low``/``high`` carry a trailing feature axis that ``t`` lacks.
        # Without the extra axis the weight only broadcasts for a single
        # step (or, wrongly, along the features when the batch has 128
        # entries). The cast keeps the result in the table's dtype.
        weight = (t - low_idx).unsqueeze(-1).to(low.dtype)
        return low + (high - low) * weight

    def _build_embedding(self, max_steps):
        """Build the ``[max_steps, 128]`` sin/cos lookup table."""
        steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
        dims = torch.arange(64).unsqueeze(0)  # [1,64]
        table = steps * 10.0 ** (dims * 4.0 / 63.0)  # [T,64]
        table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
        return table
89
-
90
-
91
class SpectrogramUpsampler(nn.Module):
    """Stretch the last axis of a spectrogram with two transposed convolutions.

    Both layers are single-channel ``ConvTranspose2d`` modules with stride 16
    along the last axis and stride 1 along the axis before it.
    """

    def __init__(self, n_mels):
        """
        :param n_mels: accepted for interface compatibility; not used by
            the layers built here
        """
        super().__init__()
        deconv_args = {"stride": [1, 16], "padding": [1, 8]}
        self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], **deconv_args)
        self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], **deconv_args)

    def forward(self, x):
        """Upsample ``x`` and return it with the same number of dimensions.

        A singleton channel axis is inserted at position 1 for the 2-D
        transposed convolutions and removed again before returning.
        """
        out = x.unsqueeze(1)
        for deconv in (self.conv1, self.conv2):
            out = F.leaky_relu(deconv(out), 0.4)
        return out.squeeze(1)
105
-
106
-
107
class ResidualBlock(nn.Module):
    """Gated dilated-convolution block producing a residual and a skip output."""

    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
        """
        :param n_mels: input channels of the 1x1 conv applied to the
            spectrogram conditioner
        :param residual_channels: channel count of the audio convolution
        :param dilation: dilation (and padding) of the audio convolution
        :param uncond: when true, no conditioner projection is created
        """
        super().__init__()
        doubled = 2 * residual_channels
        self.dilated_conv = Conv1d(
            residual_channels, doubled, 3, padding=dilation, dilation=dilation
        )
        self.diffusion_projection = nn.Linear(512, residual_channels)
        # Conditional models project the spectrogram; unconditional ones
        # keep ``None`` here, which ``forward`` checks against.
        self.conditioner_projection = None if uncond else Conv1d(n_mels, doubled, 1)
        self.output_projection = Conv1d(residual_channels, doubled, 1)

    def forward(self, x, diffusion_step, conditioner=None):
        """Run the block and return ``(residual_output, skip)``.

        ``conditioner`` must be given exactly when the block was built as a
        conditional one; a mismatch trips the assertion below.
        """
        assert (conditioner is None) == (self.conditioner_projection is None)

        step_bias = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        projected = (
            None
            if self.conditioner_projection is None
            else self.conditioner_projection(conditioner)
        )
        y = self.dilated_conv(x + step_bias)
        if projected is not None:
            y = y + projected

        # Split the doubled channels into the two halves of the gated unit.
        gate, filt = torch.chunk(y, 2, dim=1)
        y = self.output_projection(torch.sigmoid(gate) * torch.tanh(filt))

        residual, skip = torch.chunk(y, 2, dim=1)
        return (x + residual) / sqrt(2.0), skip
150
-
151
-
152
class DiffWave(nn.Module):
    """Stack of residual blocks conditioned on a diffusion step and, optionally, a spectrogram."""

    def __init__(self, params):
        """
        :param params: configuration object read via attributes:
            ``residual_channels``, ``noise_schedule``, ``unconditional``,
            ``n_mels``, ``dilation_cycle_length`` and ``residual_layers``
        """
        super().__init__()
        self.params = params
        channels = params.residual_channels
        self.input_projection = Conv1d(1, channels, 1)
        self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
        # No upsampler is created for the unconditional model.
        self.spectrogram_upsampler = (
            None if self.params.unconditional else SpectrogramUpsampler(params.n_mels)
        )

        # Dilation doubles per layer and wraps every ``dilation_cycle_length``.
        dilations = [
            2 ** (index % params.dilation_cycle_length)
            for index in range(params.residual_layers)
        ]
        self.residual_layers = nn.ModuleList(
            [
                ResidualBlock(
                    params.n_mels, channels, rate, uncond=params.unconditional
                )
                for rate in dilations
            ]
        )
        self.skip_projection = Conv1d(channels, channels, 1)
        self.output_projection = Conv1d(channels, 1, 1)
        # The final projection starts with an all-zero weight.
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, audio, diffusion_step, spectrogram=None):
        """Run the network and return the output of the final 1x1 projection.

        ``spectrogram`` must be given exactly when the model was built as a
        conditional one; a mismatch trips the assertion below.
        ``audio`` gets a channel axis inserted at position 1, so it is
        presumably ``[batch, samples]`` (TODO confirm against callers).
        """
        assert (spectrogram is None) == (self.spectrogram_upsampler is None)

        x = F.relu(self.input_projection(audio.unsqueeze(1)))

        diffusion_step = self.diffusion_embedding(diffusion_step)
        if self.spectrogram_upsampler is not None:  # conditional model
            spectrogram = self.spectrogram_upsampler(spectrogram)

        skip_total = None
        for block in self.residual_layers:
            x, block_skip = block(x, diffusion_step, spectrogram)
            skip_total = block_skip if skip_total is None else block_skip + skip_total

        # Scale the summed skips by the square root of the layer count.
        x = skip_total / sqrt(len(self.residual_layers))
        x = F.relu(self.skip_projection(x))
        return self.output_projection(x)