pocket-tts 1.0.2 (pocket_tts-1.0.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ import torch
+ from torch import nn
+
+ from pocket_tts.modules.conv import StreamingConv1d, StreamingConvTranspose1d
+
+
+ class ConvDownsample1d(nn.Module):
+     """
+     Downsampling by some integer amount `stride` using convolutions
+     with a kernel size of twice the stride.
+     """
+
+     def __init__(self, stride: int, dimension: int):
+         super().__init__()
+         self.conv = StreamingConv1d(
+             dimension,
+             dimension,
+             kernel_size=2 * stride,
+             stride=stride,
+             groups=1,
+             bias=False,
+             pad_mode="replicate",
+         )
+
+     def forward(self, x: torch.Tensor, model_state: dict | None):
+         return self.conv(x, model_state)
+
+
+ class ConvTrUpsample1d(nn.Module):
+     """
+     Upsample by some integer amount `stride` using transposed convolutions.
+     """
+
+     def __init__(self, stride: int, dimension: int):
+         super().__init__()
+         self.convtr = StreamingConvTranspose1d(
+             dimension,
+             dimension,
+             kernel_size=2 * stride,
+             stride=stride,
+             groups=dimension,
+             bias=False,
+         )
+
+     def forward(self, x: torch.Tensor, model_state: dict | None):
+         return self.convtr(x, model_state)
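A minimal usage sketch of the resampling pair follows. It is not part of the wheel: the import path `pocket_tts.modules.resample`, the (batch, channels, time) input layout, and the use of `init_states` to build a streaming state for the wrapped StreamingConv1d are all assumptions, not confirmed by this diff.

# Hypothetical usage sketch; module path and tensor layout are assumptions.
import torch

from pocket_tts.modules.resample import ConvDownsample1d, ConvTrUpsample1d  # assumed path
from pocket_tts.modules.stateful_module import init_states

stride, dim = 4, 128
down = ConvDownsample1d(stride=stride, dimension=dim)
up = ConvTrUpsample1d(stride=stride, dimension=dim)

x = torch.randn(1, dim, 64)  # assuming channel-first Conv1d input (batch, channels, time)
down_state = init_states(down, batch_size=1, sequence_length=64)
up_state = init_states(up, batch_size=1, sequence_length=64)

y = down(x, down_state)  # time axis shrunk roughly by `stride`
z = up(y, up_state)      # time axis expanded back roughly by `stride`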
@@ -0,0 +1,74 @@
+ import math
+
+ import torch
+ from torch import nn
+
+
+ def apply_rope(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     offset: int | torch.Tensor = 0,
+     max_period: int | float = 10_000,
+ ):
+     """
+     Args:
+         q (torch.Tensor): Queries, shape `[B, T, H, D]`.
+         k (torch.Tensor): Keys, shape `[B, T, H, D]`.
+         offset (int or torch.Tensor): Current offset, e.g. when streaming.
+         max_period (float): Maximum period for the cos and sin.
+     """
+
+     B, T, H, D = q.shape
+     Bk, Tk, Hk, Dk = k.shape
+     assert (B, T, D) == (Bk, Tk, Dk)
+     assert D > 0
+     assert D % 2 == 0
+     assert max_period > 0
+
+     ds = torch.arange(D // 2, device=q.device, dtype=torch.float32)
+     freqs = torch.exp(ds * (-math.log(max_period) * 2 / D))
+
+     # could be optimized in one call
+     ts = torch.arange(T, device=q.device, dtype=torch.float32)
+     ts += offset
+     ts = ts.view(-1, 1, 1)
+
+     q = q.view(B, T, H, D // 2, 2)
+     k = k.view(B, T, Hk, D // 2, 2)
+
+     # convention is `r` suffix is real part, `i` is imaginary.
+     qr = q[..., 0].float()
+     qi = q[..., 1].float()
+
+     kr = k[..., 0].float()
+     ki = k[..., 1].float()
+
+     rotr = torch.cos(freqs * ts)
+     roti = torch.sin(freqs * ts)
+     qor = qr * rotr - qi * roti
+     qoi = qr * roti + qi * rotr
+
+     kor = kr * rotr - ki * roti
+     koi = kr * roti + ki * rotr
+
+     dtype = q.dtype
+     qo = torch.stack([qor.to(dtype), qoi.to(dtype)], dim=-1)
+     ko = torch.stack([kor.to(dtype), koi.to(dtype)], dim=-1)
+
+     return qo.view(B, T, H, D), ko.view(B, T, Hk, D)
+
+
+ class RotaryEmbedding(nn.Module):
+     """Rotary positional embedding (RoPE) from [Su et al. 2021](https://arxiv.org/abs/2104.09864).
+
+     Args:
+         max_period (float): Maximum period of the rotation frequencies.
+     """
+
+     def __init__(self, max_period: float | int = 10000.0):
+         super().__init__()
+         self.max_period = max_period
+
+     def forward(self, q: torch.Tensor, k: torch.Tensor, offset: torch.Tensor | int):
+         """Apply rope rotation to the query and key tensors."""
+         return apply_rope(q, k, offset, self.max_period)
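A short self-contained check of the rotation above: because each (real, imaginary) pair is rotated by an angle proportional to its position, RoPE preserves vector norms, and shifting queries and keys by the same streaming offset leaves the q·k attention scores unchanged. This is a sanity-check sketch that assumes only `apply_rope` from the module above.

# Sanity check of apply_rope's relative-position property (illustrative only).
import torch

from pocket_tts.modules.rope import apply_rope

B, T, H, D = 1, 8, 2, 16
q = torch.randn(B, T, H, D)
k = torch.randn(B, T, H, D)

q0, k0 = apply_rope(q, k, offset=0)
q5, k5 = apply_rope(q, k, offset=5)

# The per-pair rotation is orthogonal, so norms are preserved.
assert torch.allclose(q.norm(dim=-1), q0.norm(dim=-1), atol=1e-4)

# Scores depend only on relative positions: moving both q and k by the same
# offset changes the individual embeddings but not their inner products.
scores0 = torch.einsum("bthd,bshd->bhts", q0, k0)
scores5 = torch.einsum("bthd,bshd->bhts", q5, k5)
assert torch.allclose(scores0, scores5, atol=1e-4)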
@@ -0,0 +1,180 @@
+ import numpy as np
+ import torch.nn as nn
+
+ from .conv import StreamingConv1d, StreamingConvTranspose1d
+
+
+ class SEANetResnetBlock(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         kernel_sizes: list[int] = [3, 1],
+         dilations: list[int] = [1, 1],
+         pad_mode: str = "reflect",
+         compress: int = 2,
+     ):
+         super().__init__()
+         assert len(kernel_sizes) == len(dilations), (
+             "Number of kernel sizes should match number of dilations"
+         )
+         hidden = dim // compress
+         block = nn.ModuleList([])
+         for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+             in_chs = dim if i == 0 else hidden
+             out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+             block += [
+                 nn.ELU(alpha=1.0),
+                 StreamingConv1d(
+                     in_chs, out_chs, kernel_size=kernel_size, dilation=dilation, pad_mode=pad_mode
+                 ),
+             ]
+         self.block = block
+
+     def forward(self, x, model_state: dict | None):
+         v = x
+         for layer in self.block:
+             if isinstance(layer, StreamingConv1d):
+                 v = layer(v, model_state)
+             else:
+                 v = layer(v)
+         assert x.shape == v.shape, (x.shape, v.shape)
+         return x + v
+
+
+ class SEANetEncoder(nn.Module):
+     def __init__(
+         self,
+         channels: int = 1,
+         dimension: int = 128,
+         n_filters: int = 32,
+         n_residual_layers: int = 3,
+         ratios: list[int] = [8, 5, 4, 2],
+         kernel_size: int = 7,
+         last_kernel_size: int = 7,
+         residual_kernel_size: int = 3,
+         dilation_base: int = 2,
+         pad_mode: str = "reflect",
+         compress: int = 2,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.dimension = dimension
+         self.n_filters = n_filters
+         self.ratios = list(reversed(ratios))
+         del ratios
+         self.n_residual_layers = n_residual_layers
+         self.hop_length = int(np.prod(self.ratios))
+         self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+
+         mult = 1
+         model = nn.ModuleList(
+             [StreamingConv1d(channels, mult * n_filters, kernel_size, pad_mode=pad_mode)]
+         )
+         # Downsample to raw audio scale
+         for i, ratio in enumerate(self.ratios):
+             # Add residual layers
+             for j in range(n_residual_layers):
+                 model += [
+                     SEANetResnetBlock(
+                         mult * n_filters,
+                         kernel_sizes=[residual_kernel_size, 1],
+                         dilations=[dilation_base**j, 1],
+                         pad_mode=pad_mode,
+                         compress=compress,
+                     )
+                 ]
+
+             # Add downsampling layers
+             model += [
+                 nn.ELU(alpha=1.0),
+                 StreamingConv1d(
+                     mult * n_filters,
+                     mult * n_filters * 2,
+                     kernel_size=ratio * 2,
+                     stride=ratio,
+                     pad_mode=pad_mode,
+                 ),
+             ]
+             mult *= 2
+
+         model += [
+             nn.ELU(alpha=1.0),
+             StreamingConv1d(mult * n_filters, dimension, last_kernel_size, pad_mode=pad_mode),
+         ]
+
+         self.model = model
+
+     def forward(self, x, model_state: dict | None):
+         for layer in self.model:
+             if isinstance(layer, (StreamingConv1d, SEANetResnetBlock)):
+                 x = layer(x, model_state)
+             else:
+                 x = layer(x)
+         return x
+
+
+ class SEANetDecoder(nn.Module):
+     def __init__(
+         self,
+         channels: int = 1,
+         dimension: int = 128,
+         n_filters: int = 32,
+         n_residual_layers: int = 3,
+         ratios: list[int] = [8, 5, 4, 2],
+         kernel_size: int = 7,
+         last_kernel_size: int = 7,
+         residual_kernel_size: int = 3,
+         dilation_base: int = 2,
+         pad_mode: str = "reflect",
+         compress: int = 2,
+     ):
+         super().__init__()
+         self.dimension = dimension
+         self.channels = channels
+         self.n_filters = n_filters
+         self.ratios = ratios
+         del ratios
+         self.n_residual_layers = n_residual_layers
+         self.hop_length = int(np.prod(self.ratios))
+         self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+         mult = int(2 ** len(self.ratios))
+         model = nn.ModuleList(
+             [StreamingConv1d(dimension, mult * n_filters, kernel_size, pad_mode=pad_mode)]
+         )
+         # Upsample to raw audio scale
+         for _, ratio in enumerate(self.ratios):
+             # Add upsampling layers
+             model += [
+                 nn.ELU(alpha=1.0),
+                 StreamingConvTranspose1d(
+                     mult * n_filters, mult * n_filters // 2, kernel_size=ratio * 2, stride=ratio
+                 ),
+             ]
+             # Add residual layers
+             for j in range(n_residual_layers):
+                 model += [
+                     SEANetResnetBlock(
+                         mult * n_filters // 2,
+                         kernel_sizes=[residual_kernel_size, 1],
+                         dilations=[dilation_base**j, 1],
+                         pad_mode=pad_mode,
+                         compress=compress,
+                     )
+                 ]
+
+             mult //= 2
+
+         # Add final layers
+         model += [
+             nn.ELU(alpha=1.0),
+             StreamingConv1d(n_filters, channels, last_kernel_size, pad_mode=pad_mode),
+         ]
+         self.model = model
+
+     def forward(self, z, model_state: dict | None):
+         for layer in self.model:
+             if isinstance(layer, (StreamingConvTranspose1d, SEANetResnetBlock, StreamingConv1d)):
+                 z = layer(z, model_state)
+             else:
+                 z = layer(z)
+         return z
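A worked example of the downsampling factor implied by the defaults above: `hop_length` is the product of the strides, so with `ratios = [8, 5, 4, 2]` the encoder compresses every 320 input samples into one latent frame and the decoder expands by the same factor. The snippet uses only numpy and the default values shown in this file.

# Worked example of the default hop length (illustrative only).
import numpy as np

ratios = [8, 5, 4, 2]
hop_length = int(np.prod(ratios))  # 8 * 5 * 4 * 2 = 320 samples per latent frame
assert hop_length == 320

samples = 32_000
frames = samples // hop_length  # 100 latent frames of size `dimension` for 32_000 samples
print(hop_length, frames)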
@@ -0,0 +1,45 @@
+ from abc import ABC, abstractmethod
+
+ import torch
+ from torch import nn
+
+
+ def init_states(
+     model: nn.Module, batch_size: int, sequence_length: int
+ ) -> dict[str, dict[str, torch.Tensor]]:
+     result = {}
+     for module_name, module in model.named_modules():
+         if not isinstance(module, StatefulModule):
+             continue
+         module._module_absolute_name = module_name
+         module_state = module.init_state(batch_size, sequence_length=sequence_length)
+         result[module_name] = module_state
+     return result
+
+
+ def increment_steps(
+     module: nn.Module, model_state: dict[str, dict[str, torch.Tensor]], increment: int = 1
+ ):
+     # print("incrementing steps by", increment)
+     for module_name, module in module.named_modules():
+         if not isinstance(module, StatefulModule):
+             continue
+         module.increment_step(model_state[module_name], increment)
+
+
+ class StatefulModule(ABC, nn.Module):
+     def __init__(self, *args, **kwds):
+         self._module_absolute_name = None
+         return super().__init__(*args, **kwds)
+
+     @abstractmethod
+     def init_state(self, batch_size: int, sequence_length: int):
+         """Initialize the state."""
+         raise NotImplementedError
+
+     def increment_step(self, state: dict, increment: int = 1):
+         pass
+
+     def get_state(self, model_state: dict[str, dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
+         """Get the state for this module from the model state."""
+         return model_state[self._module_absolute_name]
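A minimal sketch of the state protocol above: `init_states` walks `named_modules()`, records each stateful module's qualified name, and collects a per-module state dict keyed by that name; `get_state` later uses the recorded name to look its own state back up. `StepCounter` below is a made-up module for illustration; only the functions and base class from this file (at the path imported elsewhere in the package) are assumed.

# Toy round trip through init_states / forward / increment_steps (illustrative only).
import torch
from torch import nn

from pocket_tts.modules.stateful_module import StatefulModule, increment_steps, init_states


class StepCounter(StatefulModule):  # hypothetical module, not part of pocket-tts
    def init_state(self, batch_size: int, sequence_length: int):
        return {"steps": torch.zeros(batch_size, sequence_length)}

    def forward(self, x, model_state):
        state = self.get_state(model_state)  # looked up via the recorded module name
        return x + state["steps"].shape[1]


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.counter = StepCounter()

    def forward(self, x, model_state):
        return self.counter(x, model_state)


model = ToyModel()
model_state = init_states(model, batch_size=2, sequence_length=10)
assert "counter" in model_state          # keys are fully qualified module names
y = model(torch.zeros(2, 3), model_state)
increment_steps(model, model_state)      # no-op here: StepCounter keeps the default increment_step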
@@ -0,0 +1,124 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ from pocket_tts.modules.rope import RotaryEmbedding
+ from pocket_tts.modules.stateful_module import StatefulModule
+
+
+ def complete_kv(
+     cache: torch.Tensor, current_end: torch.Tensor, k: torch.Tensor, v: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     current_end = current_end.shape[0]
+
+     cache[0, :, current_end : current_end + k.shape[1]] = k
+     cache[1, :, current_end : current_end + v.shape[1]] = v
+     valid = cache[:, :, : current_end + k.shape[1]]
+     return valid[0], valid[1]
+
+
+ def _materialize_causal_mask(
+     shape: tuple[int, ...], shift: int, device: str | torch.device = "cpu"
+ ) -> torch.Tensor:
+     dtype = torch.float32
+
+     num_queries, num_keys = shape[-2:]
+     shift = num_keys - num_queries
+
+     tensor = torch.full(shape, dtype=dtype, fill_value=1, device=device)
+     mask = torch.tril(tensor, diagonal=shift).to(dtype)
+     mask = torch.log(mask)
+     return mask.to(dtype)
+
+
+ class StreamingMultiheadAttention(StatefulModule):
+     """Similar to `nn.MultiheadAttention` but with support for streaming.
+
+     Args:
+         embed_dim (int): Dimension to project to.
+         num_heads (int): Number of heads.
+         rope (`RotaryEmbedding`): Rope embedding to use.
+     """
+
+     def __init__(self, embed_dim: int, num_heads: int, rope: RotaryEmbedding):
+         super().__init__()
+
+         self.embed_dim = embed_dim
+         self.rope = rope
+         self.num_heads = num_heads
+
+         out_dim = embed_dim
+         num_kv = num_heads
+         kv_dim = (embed_dim // num_heads) * num_kv
+         out_dim += 2 * kv_dim
+         mult = 1
+         self.in_proj = nn.Linear(embed_dim, mult * out_dim, bias=False)
+         self.out_proj = nn.Linear(embed_dim, mult * embed_dim, bias=False)
+
+     def _get_mask(self, shape: tuple[int, int], shift: int, device: torch.device) -> torch.Tensor:
+         return _materialize_causal_mask(shape, shift=shift, device=device)
+
+     def init_state(self, batch_size: int, sequence_length: int) -> dict[str, torch.Tensor]:
+         dim_per_head = self.embed_dim // self.num_heads
+         initial_current_end = torch.zeros((0,)).to(self.in_proj.weight.device)
+         return dict(
+             current_end=initial_current_end,
+             cache=torch.full(
+                 (2, batch_size, sequence_length, self.num_heads, dim_per_head),
+                 float("NaN"),
+                 device=self.in_proj.weight.device,
+                 dtype=self.in_proj.weight.dtype,
+             ),
+         )
+
+     def increment_step(self, state: dict, increment: int = 1):
+         new_size = state["current_end"].shape[0] + increment
+         state["current_end"] = torch.zeros((new_size,)).to(state["current_end"].device)
+
+     def _complete_kv(self, k, v, state: dict | None):
+         k, v = complete_kv(state["cache"], state["current_end"], k, v)
+         return k, v
+
+     def _apply_rope(self, query: torch.Tensor, key: torch.Tensor, state: dict | None):
+         # Apply rope embeddings to query and key tensors.
+         streaming_offset = self._streaming_offset(state)
+         return self.rope(query, key, offset=streaming_offset)
+
+     def _streaming_offset(self, state: dict | None) -> torch.Tensor | int:
+         return state["current_end"].shape[0]
+
+     def check_model_state(self, model_state: dict):
+         if model_state is None:
+             raise ValueError("model_state must be provided")
+         return self.get_state(model_state)
+
+     def forward(self, query: torch.Tensor, model_state: dict | None):
+         state = self.check_model_state(model_state)
+
+         projected = self.in_proj(query)
+         # Reshape from (b, t, p*h*d) to (b, t, p, h, d) where p=3, h=num_heads
+         b, t, _ = projected.shape
+         d = self.embed_dim // self.num_heads
+         packed = projected.view(b, t, 3, self.num_heads, d)
+         q, k, v = torch.unbind(packed, dim=2)
+         q, k = self._apply_rope(q, k, state)
+         k, v = self._complete_kv(k, v, state)
+
+         mask_shape = (query.shape[1], query.shape[1] + state["current_end"].shape[0])
+         shift = state["current_end"].shape[0]
+
+         attn_mask = self._get_mask(mask_shape, shift=shift, device=q.device)
+
+         q, k, v = [x.transpose(1, 2) for x in (q, k, v)]
+         x = F.scaled_dot_product_attention(q, k, v, attn_mask)
+         x = x.transpose(1, 2)
+         # Reshape from (b, t, h, d) to (b, t, h*d)
+         b, t, h, d = x.shape
+         x = x.reshape(b, t, h * d)
+         x = self.out_proj(x)
+
+         return x
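A self-contained illustration of the additive mask built by `_materialize_causal_mask`: taking the log of a lower-triangular matrix of ones yields 0.0 where a query may attend and -inf where it may not, and the diagonal offset of num_keys - num_queries lets each new query see all previously cached keys plus itself. This sketch reproduces the construction with plain torch; it does not import pocket_tts.

# Illustration of the log(tril(ones)) causal mask (illustrative only).
import torch

num_queries, num_keys = 2, 5          # e.g. 2 new tokens attending over 3 cached steps + themselves
shift = num_keys - num_queries        # same recomputation as inside _materialize_causal_mask
ones = torch.full((num_queries, num_keys), 1.0)
mask = torch.log(torch.tril(ones, diagonal=shift))

# Row 0 (first new query) sees keys 0..3, row 1 sees keys 0..4.
print(mask)
# tensor([[0., 0., 0., 0., -inf],
#         [0., 0., 0., 0., 0.]])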