xinference 1.9.1__py3-none-any.whl → 1.10.1__py3-none-any.whl
This diff shows the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +400 -3
- xinference/client/restful/async_restful_client.py +20 -3
- xinference/client/restful/restful_client.py +20 -3
- xinference/constants.py +2 -0
- xinference/core/supervisor.py +111 -49
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +26 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +58 -1
- xinference/model/embedding/sentence_transformers/core.py +4 -4
- xinference/model/embedding/vllm/core.py +7 -1
- xinference/model/image/model_spec.json +71 -3
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +1 -0
- xinference/model/llm/llm_family.json +503 -21
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +32 -55
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +190 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/utils.py +138 -53
- xinference/model/llm/vllm/core.py +95 -78
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.d192c4f3.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/METADATA +24 -4
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/RECORD +302 -76
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/WHEEL +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py
@@ -0,0 +1,656 @@
+"""A popular speaker recognition and diarization model.
+
+Authors
+ * Hwidong Na 2020
+"""
+
+import torch  # noqa: F401
+import torch.nn as nn
+import torch.nn.functional as F
+
+from indextts.BigVGAN.nnet.CNN import Conv1d as _Conv1d
+from indextts.BigVGAN.nnet.linear import Linear
+from indextts.BigVGAN.nnet.normalization import BatchNorm1d as _BatchNorm1d
+
+
+def length_to_mask(length, max_len=None, dtype=None, device=None):
+    """Creates a binary mask for each sequence.
+
+    Reference: https://discuss.pytorch.org/t/how-to-generate-variable-length-mask/23397/3
+
+    Arguments
+    ---------
+    length : torch.LongTensor
+        Containing the length of each sequence in the batch. Must be 1D.
+    max_len : int
+        Max length for the mask, also the size of the second dimension.
+    dtype : torch.dtype, default: None
+        The dtype of the generated mask.
+    device: torch.device, default: None
+        The device to put the mask variable.
+
+    Returns
+    -------
+    mask : tensor
+        The binary mask.
+
+    Example
+    -------
+    >>> length=torch.Tensor([1,2,3])
+    >>> mask=length_to_mask(length)
+    >>> mask
+    tensor([[1., 0., 0.],
+            [1., 1., 0.],
+            [1., 1., 1.]])
+    """
+    assert len(length.shape) == 1
+
+    if max_len is None:
+        max_len = length.max().long().item()  # using arange to generate mask
+    mask = torch.arange(
+        max_len, device=length.device, dtype=length.dtype
+    ).expand(len(length), max_len) < length.unsqueeze(1)
+
+    if dtype is None:
+        dtype = length.dtype
+
+    if device is None:
+        device = length.device
+
+    mask = torch.as_tensor(mask, dtype=dtype, device=device)
+    return mask
+
+
+# Skip transpose as much as possible for efficiency
+class Conv1d(_Conv1d):
+    """1D convolution. Skip transpose is used to improve efficiency."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(skip_transpose=True, *args, **kwargs)
+
+
+class BatchNorm1d(_BatchNorm1d):
+    """1D batch normalization. Skip transpose is used to improve efficiency."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(skip_transpose=True, *args, **kwargs)
+
+
+class TDNNBlock(nn.Module):
+    """An implementation of TDNN.
+
+    Arguments
+    ---------
+    in_channels : int
+        Number of input channels.
+    out_channels : int
+        The number of output channels.
+    kernel_size : int
+        The kernel size of the TDNN blocks.
+    dilation : int
+        The dilation of the TDNN block.
+    activation : torch class
+        A class for constructing the activation layers.
+    groups : int
+        The groups size of the TDNN blocks.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        dilation,
+        activation=nn.ReLU,
+        groups=1,
+    ):
+        super().__init__()
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            groups=groups,
+        )
+        self.activation = activation()
+        self.norm = BatchNorm1d(input_size=out_channels)
+
+    def forward(self, x):
+        """Processes the input tensor x and returns an output tensor."""
+        return self.norm(self.activation(self.conv(x)))
+
+
+class Res2NetBlock(torch.nn.Module):
+    """An implementation of Res2NetBlock w/ dilation.
+
+    Arguments
+    ---------
+    in_channels : int
+        The number of channels expected in the input.
+    out_channels : int
+        The number of output channels.
+    scale : int
+        The scale of the Res2Net block.
+    kernel_size: int
+        The kernel size of the Res2Net block.
+    dilation : int
+        The dilation of the Res2Net block.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
+    ):
+        super().__init__()
+        assert in_channels % scale == 0
+        assert out_channels % scale == 0
+
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+
+        self.blocks = nn.ModuleList(
+            [
+                TDNNBlock(
+                    in_channel,
+                    hidden_channel,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                )
+                for i in range(scale - 1)
+            ]
+        )
+        self.scale = scale
+
+    def forward(self, x):
+        """Processes the input tensor x and returns an output tensor."""
+        y = []
+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
+            if i == 0:
+                y_i = x_i
+            elif i == 1:
+                y_i = self.blocks[i - 1](x_i)
+            else:
+                y_i = self.blocks[i - 1](x_i + y_i)
+            y.append(y_i)
+        y = torch.cat(y, dim=1)
+        return y
+
+
+class SEBlock(nn.Module):
+    """An implementation of squeeze-and-excitation block.
+
+    Arguments
+    ---------
+    in_channels : int
+        The number of input channels.
+    se_channels : int
+        The number of output channels after squeeze.
+    out_channels : int
+        The number of output channels.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> se_layer = SEBlock(64, 16, 64)
+    >>> lengths = torch.rand((8,))
+    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(self, in_channels, se_channels, out_channels):
+        super().__init__()
+
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1
+        )
+        self.relu = torch.nn.ReLU(inplace=True)
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1
+        )
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x, lengths=None):
+        """Processes the input tensor x and returns an output tensor."""
+        L = x.shape[-1]
+        if lengths is not None:
+            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
+            mask = mask.unsqueeze(1)
+            total = mask.sum(dim=2, keepdim=True)
+            s = (x * mask).sum(dim=2, keepdim=True) / total
+        else:
+            s = x.mean(dim=2, keepdim=True)
+
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+
+        return s * x
+
+
+class AttentiveStatisticsPooling(nn.Module):
+    """This class implements an attentive statistic pooling layer for each channel.
+    It returns the concatenated mean and std of the input tensor.
+
+    Arguments
+    ---------
+    channels: int
+        The number of input channels.
+    attention_channels: int
+        The number of attention channels.
+    global_context: bool
+        Whether to use global context.
+
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> asp_layer = AttentiveStatisticsPooling(64)
+    >>> lengths = torch.rand((8,))
+    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 1, 128])
+    """
+
+    def __init__(self, channels, attention_channels=128, global_context=True):
+        super().__init__()
+
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+        self.tanh = nn.Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels, out_channels=channels, kernel_size=1
+        )
+
+    def forward(self, x, lengths=None):
+        """Calculates mean and std for a batch (input tensor).
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape [N, C, L].
+        lengths : torch.Tensor
+            The corresponding relative lengths of the inputs.
+
+        Returns
+        -------
+        pooled_stats : torch.Tensor
+            mean and std of batch
+        """
+        L = x.shape[-1]
+
+        def _compute_statistics(x, m, dim=2, eps=self.eps):
+            mean = (m * x).sum(dim)
+            std = torch.sqrt(
+                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
+            )
+            return mean, std
+
+        if lengths is None:
+            lengths = torch.ones(x.shape[0], device=x.device)
+
+        # Make binary mask of shape [N, 1, L]
+        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
+        mask = mask.unsqueeze(1)
+
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        if self.global_context:
+            # torch.std is unstable for backward computation
+            # https://github.com/pytorch/pytorch/issues/4320
+            total = mask.sum(dim=2, keepdim=True).float()
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).repeat(1, 1, L)
+            std = std.unsqueeze(2).repeat(1, 1, L)
+            attn = torch.cat([x, mean, std], dim=1)
+        else:
+            attn = x
+
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+
+        # Filter out zero-paddings
+        attn = attn.masked_fill(mask == 0, float("-inf"))
+
+        attn = F.softmax(attn, dim=2)
+        mean, std = _compute_statistics(x, attn)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+
+        return pooled_stats
+
+
+class SERes2NetBlock(nn.Module):
+    """An implementation of building block in ECAPA-TDNN, i.e.,
+    TDNN-Res2Net-TDNN-SEBlock.
+
+    Arguments
+    ---------
+    in_channels: int
+        Expected size of input channels.
+    out_channels: int
+        The number of output channels.
+    res2net_scale: int
+        The scale of the Res2Net block.
+    se_channels : int
+        The number of output channels after squeeze.
+    kernel_size: int
+        The kernel size of the TDNN blocks.
+    dilation: int
+        The dilation of the Res2Net block.
+    activation : torch class
+        A class for constructing the activation layers.
+    groups: int
+        Number of blocked connections from input channels to output channels.
+
+    Example
+    -------
+    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
+    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
+    >>> out = conv(x).transpose(1, 2)
+    >>> out.shape
+    torch.Size([8, 120, 64])
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        res2net_scale=8,
+        se_channels=128,
+        kernel_size=1,
+        dilation=1,
+        activation=torch.nn.ReLU,
+        groups=1,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+            groups=groups,
+        )
+        self.res2net_block = Res2NetBlock(
+            out_channels, out_channels, res2net_scale, kernel_size, dilation
+        )
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+            groups=groups,
+        )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x, lengths=None):
+        """Processes the input tensor x and returns an output tensor."""
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, lengths)
+
+        return x + residual
+
+
+class ECAPA_TDNN(torch.nn.Module):
+    """An implementation of the speaker embedding model in a paper.
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
+    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
+
+    Arguments
+    ---------
+    input_size : int
+        Expected size of the input dimension.
+    device : str
+        Device used, e.g., "cpu" or "cuda".
+    lin_neurons : int
+        Number of neurons in linear layers.
+    activation : torch class
+        A class for constructing the activation layers.
+    channels : list of ints
+        Output channels for TDNN/SERes2Net layer.
+    kernel_sizes : list of ints
+        List of kernel sizes for each layer.
+    dilations : list of ints
+        List of dilations for kernels in each layer.
+    attention_channels: int
+        The number of attention channels.
+    res2net_scale : int
+        The scale of the Res2Net block.
+    se_channels : int
+        The number of output channels after squeeze.
+    global_context: bool
+        Whether to use global context.
+    groups : list of ints
+        List of groups for kernels in each layer.
+
+    Example
+    -------
+    >>> input_feats = torch.rand([5, 120, 80])
+    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
+    >>> outputs = compute_embedding(input_feats)
+    >>> outputs.shape
+    torch.Size([5, 1, 192])
+    """
+
+    def __init__(
+        self,
+        input_size,
+        device="cpu",
+        lin_neurons=192,
+        activation=torch.nn.ReLU,
+        channels=[512, 512, 512, 512, 1536],
+        kernel_sizes=[5, 3, 3, 3, 1],
+        dilations=[1, 2, 3, 4, 1],
+        attention_channels=128,
+        res2net_scale=8,
+        se_channels=128,
+        global_context=True,
+        groups=[1, 1, 1, 1, 1],
+    ):
+        super().__init__()
+        assert len(channels) == len(kernel_sizes)
+        assert len(channels) == len(dilations)
+        self.channels = channels
+        self.blocks = nn.ModuleList()
+
+        # The initial TDNN layer
+        self.blocks.append(
+            TDNNBlock(
+                input_size,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                activation,
+                groups[0],
+            )
+        )
+
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    activation=activation,
+                    groups=groups[i],
+                )
+            )
+
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-2] * (len(channels) - 2),
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            activation,
+            groups=groups[-1],
+        )
+
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context,
+        )
+        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
+
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=lin_neurons,
+            kernel_size=1,
+        )
+
+    def forward(self, x, lengths=None):
+        """Returns the embedding vector.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape (batch, time, channel).
+        lengths : torch.Tensor
+            Corresponding relative lengths of inputs.
+
+        Returns
+        -------
+        x : torch.Tensor
+            Embedding vector.
+        """
+        # Minimize transpose for efficiency
+        x = x.transpose(1, 2)
+
+        xl = []
+        for layer in self.blocks:
+            try:
+                x = layer(x, lengths=lengths)
+            except TypeError:
+                x = layer(x)
+            xl.append(x)
+
+        # Multi-layer feature aggregation
+        x = torch.cat(xl[1:], dim=1)
+        x = self.mfa(x)
+
+        # Attentive Statistical Pooling
+        x = self.asp(x, lengths=lengths)
+        x = self.asp_bn(x)
+
+        # Final linear transformation
+        x = self.fc(x)
+
+        x = x.transpose(1, 2)
+        return x
+
+
+class Classifier(torch.nn.Module):
+    """This class implements the cosine similarity on the top of features.
+
+    Arguments
+    ---------
+    input_size : int
+        Expected size of input dimension.
+    device : str
+        Device used, e.g., "cpu" or "cuda".
+    lin_blocks : int
+        Number of linear layers.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    out_neurons : int
+        Number of classes.
+
+    Example
+    -------
+    >>> classify = Classifier(input_size=2, lin_neurons=2, out_neurons=2)
+    >>> outputs = torch.tensor([ [1., -1.], [-9., 1.], [0.9, 0.1], [0.1, 0.9] ])
+    >>> outputs = outputs.unsqueeze(1)
+    >>> cos = classify(outputs)
+    >>> (cos < -1.0).long().sum()
+    tensor(0)
+    >>> (cos > 1.0).long().sum()
+    tensor(0)
+    """
+
+    def __init__(
+        self,
+        input_size,
+        device="cpu",
+        lin_blocks=0,
+        lin_neurons=192,
+        out_neurons=1211,
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+
+        for block_index in range(lin_blocks):
+            self.blocks.extend(
+                [
+                    _BatchNorm1d(input_size=input_size),
+                    Linear(input_size=input_size, n_neurons=lin_neurons),
+                ]
+            )
+            input_size = lin_neurons
+
+        # Final Layer
+        self.weight = nn.Parameter(
+            torch.FloatTensor(out_neurons, input_size, device=device)
+        )
+        nn.init.xavier_uniform_(self.weight)
+
+    def forward(self, x):
+        """Returns the output probabilities over speakers.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Torch tensor.
+
+        Returns
+        -------
+        out : torch.Tensor
+            Output probabilities over speakers.
+        """
+        for layer in self.blocks:
+            x = layer(x)
+
+        # Need to be normalized
+        x = F.linear(F.normalize(x.squeeze(1)), F.normalize(self.weight))
+        return x.unsqueeze(1)
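For context, the ECAPA_TDNN and Classifier classes added above compose into a speaker-embedding pipeline exactly as their docstring examples suggest. The following smoke-test sketch is not part of the package: it assumes the module is importable under the path shown in the file list (xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py), uses random tensors as stand-ins for real 80-dim fbank features, and the 1211-speaker head simply mirrors the class default.

    import torch

    # Hypothetical import path, derived from the file's location in this wheel.
    from xinference.thirdparty.indextts.BigVGAN.ECAPA_TDNN import ECAPA_TDNN, Classifier

    # Batch of 5 utterances, 120 frames each, 80-dim features: (batch, time, feature).
    feats = torch.rand([5, 120, 80])

    # Embedding extractor, mirroring the ECAPA_TDNN docstring example.
    model = ECAPA_TDNN(80, lin_neurons=192)
    model.eval()
    with torch.no_grad():
        emb = model(feats)          # -> torch.Size([5, 1, 192])

    # Cosine-similarity head; out_neurons=1211 is the class default, not a requirement.
    classify = Classifier(input_size=192, out_neurons=1211)
    with torch.no_grad():
        scores = classify(emb)      # -> torch.Size([5, 1, 1211]), values in [-1, 1]

    print(emb.shape, scores.shape)

One design note visible in the code itself: with global_context=True (the default), AttentiveStatisticsPooling conditions its attention on the utterance-level mean and std, which is why its internal TDNNBlock takes channels * 3 inputs.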