xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +8 -4
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +7 -5
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/local.py +5 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/deploy/worker.py +6 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/core.py +3 -0
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +18 -4
- xinference/model/embedding/vllm/core.py +36 -9
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +178 -1
- xinference/model/image/stable_diffusion/core.py +155 -23
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +688 -11
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +20 -18
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +44 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +87 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +156 -0
- xinference/model/utils.py +108 -0
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +2 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
- xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/flow/decoder.py

@@ -11,16 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Tuple
+from typing import Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import pack, rearrange, repeat
-from diffusers.models.attention_processor import Attention, AttnProcessor2_0, inspect, logger, deprecate
 from cosyvoice.utils.common import mask_to_bias
 from cosyvoice.utils.mask import add_optional_chunk_mask
 from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
-from matcha.models.components.transformer import BasicTransformerBlock
+from matcha.models.components.transformer import BasicTransformerBlock
 
 
 class Transpose(torch.nn.Module):
@@ -29,7 +28,7 @@ class Transpose(torch.nn.Module):
         self.dim0 = dim0
         self.dim1 = dim1
 
-    def forward(self, x: torch.Tensor) ->
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = torch.transpose(x, self.dim0, self.dim1)
         return x
 
@@ -57,15 +56,10 @@ class CausalConv1d(torch.nn.Conv1d):
         assert stride == 1
         self.causal_padding = kernel_size - 1
 
-    def forward(self, x: torch.Tensor
-
-            x = F.pad(x, (self.causal_padding, 0), value=0.0)
-        else:
-            assert cache.size(2) == self.causal_padding
-            x = torch.concat([cache, x], dim=2)
-        cache = x[:, :, -self.causal_padding:]
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.pad(x, (self.causal_padding, 0), value=0.0)
         x = super(CausalConv1d, self).forward(x)
-        return x
+        return x
 
 
 class CausalBlock1D(Block1D):
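The new `CausalConv1d.forward` above drops the streaming `cache` argument and always left-pads the input by `kernel_size - 1`. A minimal, self-contained sketch of why that padding keeps the convolution causal (illustration only, not the packaged class):

```python
import torch
import torch.nn.functional as F

# Left-padding a 1-D convolution by (kernel_size - 1) means every output frame
# is computed from the current and past input frames only, so no future
# context (and hence no cross-chunk cache) is required.
kernel_size = 3
conv = torch.nn.Conv1d(in_channels=1, out_channels=1, kernel_size=kernel_size, bias=False)

x = torch.randn(1, 1, 10)                              # (batch, channels, time)
x_padded = F.pad(x, (kernel_size - 1, 0), value=0.0)   # pad on the left only
y = conv(x_padded)

assert y.shape[-1] == x.shape[-1]                      # same number of frames in and out
```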
@@ -79,11 +73,9 @@ class CausalBlock1D(Block1D):
             nn.Mish(),
         )
 
-    def forward(self, x: torch.Tensor, mask: torch.Tensor
-        output
-
-            output = self.block[i](output)
-        return output * mask, cache
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        output = self.block(x * mask)
+        return output * mask
 
 
 class CausalResnetBlock1D(ResnetBlock1D):
@@ -92,303 +84,6 @@ class CausalResnetBlock1D(ResnetBlock1D):
         self.block1 = CausalBlock1D(dim, dim_out)
         self.block2 = CausalBlock1D(dim_out, dim_out)
 
-    def forward(self, x: torch.Tensor, mask: torch.Tensor, time_emb: torch.Tensor,
-                block1_cache: torch.Tensor = torch.zeros(0, 0, 0), block2_cache: torch.Tensor = torch.zeros(0, 0, 0)
-                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        h, block1_cache = self.block1(x, mask, block1_cache)
-        h += self.mlp(time_emb).unsqueeze(-1)
-        h, block2_cache = self.block2(h, mask, block2_cache)
-        output = h + self.res_conv(x * mask)
-        return output, block1_cache, block2_cache
-
-
-class CausalAttnProcessor2_0(AttnProcessor2_0):
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
-    """
-
-    def __init__(self):
-        super(CausalAttnProcessor2_0, self).__init__()
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-        *args,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor, torch.Tensor]:
-        if len(args) > 0 or kwargs.get("scale", None) is not None:
-            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. \
-                `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
-            deprecate("scale", "1.0.0", deprecation_message)
-
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        if attention_mask is not None:
-            # NOTE do not use attn.prepare_attention_mask as we have already provided the correct attention_mask
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.unsqueeze(dim=1).repeat(1, attn.heads, 1, 1)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key_cache = attn.to_k(encoder_hidden_states)
-        value_cache = attn.to_v(encoder_hidden_states)
-        # NOTE here we judge cache.size(0) instead of cache.size(1), because init_cache has size (2, 0, 512, 2)
-        if cache.size(0) != 0:
-            key = torch.concat([cache[:, :, :, 0], key_cache], dim=1)
-            value = torch.concat([cache[:, :, :, 1], value_cache], dim=1)
-        else:
-            key, value = key_cache, value_cache
-        cache = torch.stack([key_cache, value_cache], dim=3)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states, cache
-
-
-@maybe_allow_in_graph
-class CausalAttention(Attention):
-    def __init__(
-        self,
-        query_dim: int,
-        cross_attention_dim: Optional[int] = None,
-        heads: int = 8,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-        bias: bool = False,
-        upcast_attention: bool = False,
-        upcast_softmax: bool = False,
-        cross_attention_norm: Optional[str] = None,
-        cross_attention_norm_num_groups: int = 32,
-        qk_norm: Optional[str] = None,
-        added_kv_proj_dim: Optional[int] = None,
-        norm_num_groups: Optional[int] = None,
-        spatial_norm_dim: Optional[int] = None,
-        out_bias: bool = True,
-        scale_qk: bool = True,
-        only_cross_attention: bool = False,
-        eps: float = 1e-5,
-        rescale_output_factor: float = 1.0,
-        residual_connection: bool = False,
-        _from_deprecated_attn_block: bool = False,
-        processor: Optional["AttnProcessor2_0"] = None,
-        out_dim: int = None,
-    ):
-        super(CausalAttention, self).__init__(query_dim, cross_attention_dim, heads, dim_head, dropout, bias, upcast_attention, upcast_softmax,
-                                              cross_attention_norm, cross_attention_norm_num_groups, qk_norm, added_kv_proj_dim, norm_num_groups,
-                                              spatial_norm_dim, out_bias, scale_qk, only_cross_attention, eps, rescale_output_factor, residual_connection,
-                                              _from_deprecated_attn_block, processor, out_dim)
-        processor = CausalAttnProcessor2_0()
-        self.set_processor(processor)
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-        **cross_attention_kwargs,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        r"""
-        The forward method of the `Attention` class.
-
-        Args:
-            hidden_states (`torch.Tensor`):
-                The hidden states of the query.
-            encoder_hidden_states (`torch.Tensor`, *optional*):
-                The hidden states of the encoder.
-            attention_mask (`torch.Tensor`, *optional*):
-                The attention mask to use. If `None`, no mask is applied.
-            **cross_attention_kwargs:
-                Additional keyword arguments to pass along to the cross attention.
-
-        Returns:
-            `torch.Tensor`: The output of the attention layer.
-        """
-        # The `Attention` class can call different attention processors / attention functions
-        # here we simply pass along all tensors to the selected processor class
-        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
-
-        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
-        unused_kwargs = [k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters]
-        if len(unused_kwargs) > 0:
-            logger.warning(
-                f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
-            )
-        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
-
-        return self.processor(
-            self,
-            hidden_states,
-            encoder_hidden_states=encoder_hidden_states,
-            attention_mask=attention_mask,
-            cache=cache,
-            **cross_attention_kwargs,
-        )
-
-
-@maybe_allow_in_graph
-class CausalBasicTransformerBlock(BasicTransformerBlock):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        dropout=0.0,
-        cross_attention_dim: Optional[int] = None,
-        activation_fn: str = "geglu",
-        num_embeds_ada_norm: Optional[int] = None,
-        attention_bias: bool = False,
-        only_cross_attention: bool = False,
-        double_self_attention: bool = False,
-        upcast_attention: bool = False,
-        norm_elementwise_affine: bool = True,
-        norm_type: str = "layer_norm",
-        final_dropout: bool = False,
-    ):
-        super(CausalBasicTransformerBlock, self).__init__(dim, num_attention_heads, attention_head_dim, dropout,
-                                                          cross_attention_dim, activation_fn, num_embeds_ada_norm,
-                                                          attention_bias, only_cross_attention, double_self_attention,
-                                                          upcast_attention, norm_elementwise_affine, norm_type, final_dropout)
-        self.attn1 = CausalAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            dropout=dropout,
-            bias=attention_bias,
-            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-            upcast_attention=upcast_attention,
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        class_labels: Optional[torch.LongTensor] = None,
-        cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 1. Self-Attention
-        if self.use_ada_layer_norm:
-            norm_hidden_states = self.norm1(hidden_states, timestep)
-        elif self.use_ada_layer_norm_zero:
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-        else:
-            norm_hidden_states = self.norm1(hidden_states)
-
-        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
-
-        attn_output, cache = self.attn1(
-            norm_hidden_states,
-            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
-            attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
-            cache=cache,
-            **cross_attention_kwargs,
-        )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
-        hidden_states = attn_output + hidden_states
-
-        # 2. Cross-Attention
-        if self.attn2 is not None:
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
-            )
-
-            attn_output = self.attn2(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                **cross_attention_kwargs,
-            )
-            hidden_states = attn_output + hidden_states
-
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-
-        if self._chunk_size is not None:
-            # "feed_forward_chunk_size" can be used to save memory
-            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
-                raise ValueError(f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: \
-                                 {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`.")
-
-            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
-            ff_output = torch.cat(
-                [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
-                dim=self._chunk_dim,
-            )
-        else:
-            ff_output = self.ff(norm_hidden_states)
-
-        if self.use_ada_layer_norm_zero:
-            ff_output = gate_mlp.unsqueeze(1) * ff_output
-
-        hidden_states = ff_output + hidden_states
-
-        return hidden_states, cache
-
 
 class ConditionalDecoder(nn.Module):
     def __init__(
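The hunk above removes the cache-carrying attention stack (`CausalAttnProcessor2_0`, `CausalAttention`, `CausalBasicTransformerBlock`); the decoder now relies on the plain `BasicTransformerBlock` (see the following hunks). For context, a hedged sketch of the key/value-cache pattern the deleted processor implemented; the function name and shapes are illustrative, not the removed API:

```python
import torch
import torch.nn.functional as F

def attend_with_kv_cache(query, key_new, value_new, kv_cache=None):
    # kv_cache packs the previous chunk's keys and values as (batch, t_cache, dim, 2),
    # mirroring the torch.stack([key, value], dim=3) layout in the removed code.
    if kv_cache is not None and kv_cache.size(1) > 0:
        key = torch.cat([kv_cache[..., 0], key_new], dim=1)
        value = torch.cat([kv_cache[..., 1], value_new], dim=1)
    else:
        key, value = key_new, value_new
    new_cache = torch.stack([key_new, value_new], dim=3)
    out = F.scaled_dot_product_attention(query, key, value)
    return out, new_cache

q = torch.randn(1, 4, 64)
k = torch.randn(1, 4, 64)
v = torch.randn(1, 4, 64)
out, cache = attend_with_kv_cache(q, k, v)                    # first chunk: no cache yet
out2, cache = attend_with_kv_cache(q, k, v, kv_cache=cache)   # later chunk also attends to cached K/V
```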
@@ -640,7 +335,7 @@ class CausalConditionalDecoder(ConditionalDecoder):
             resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
             transformer_blocks = nn.ModuleList(
                 [
-
+                    BasicTransformerBlock(
                         dim=output_channel,
                         num_attention_heads=num_heads,
                         attention_head_dim=attention_head_dim,
@@ -662,7 +357,7 @@ class CausalConditionalDecoder(ConditionalDecoder):
 
             transformer_blocks = nn.ModuleList(
                 [
-
+                    BasicTransformerBlock(
                         dim=output_channel,
                         num_attention_heads=num_heads,
                         attention_head_dim=attention_head_dim,
@@ -687,7 +382,7 @@ class CausalConditionalDecoder(ConditionalDecoder):
             )
             transformer_blocks = nn.ModuleList(
                 [
-
+                    BasicTransformerBlock(
                         dim=output_channel,
                         num_attention_heads=num_heads,
                         attention_head_dim=attention_head_dim,
@@ -724,7 +419,6 @@ class CausalConditionalDecoder(ConditionalDecoder):
         Returns:
             _type_: _description_
         """
-
         t = self.time_embeddings(t).to(t.dtype)
         t = self.time_mlp(t)
 
@@ -740,36 +434,36 @@ class CausalConditionalDecoder(ConditionalDecoder):
         masks = [mask]
         for resnet, transformer_blocks, downsample in self.down_blocks:
             mask_down = masks[-1]
-            x
+            x = resnet(x, mask_down, t)
             x = rearrange(x, "b c t -> b t c").contiguous()
             if streaming is True:
-                attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size,
+                attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
             else:
                 attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
             attn_mask = mask_to_bias(attn_mask, x.dtype)
             for transformer_block in transformer_blocks:
-                x
+                x = transformer_block(
                     hidden_states=x,
                     attention_mask=attn_mask,
                     timestep=t,
                 )
             x = rearrange(x, "b t c -> b c t").contiguous()
             hiddens.append(x)  # Save hidden states for skip connections
-            x
+            x = downsample(x * mask_down)
             masks.append(mask_down[:, :, ::2])
         masks = masks[:-1]
         mask_mid = masks[-1]
 
         for resnet, transformer_blocks in self.mid_blocks:
-            x
+            x = resnet(x, mask_mid, t)
             x = rearrange(x, "b c t -> b t c").contiguous()
             if streaming is True:
-                attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size,
+                attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
             else:
                 attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
             attn_mask = mask_to_bias(attn_mask, x.dtype)
             for transformer_block in transformer_blocks:
-                x
+                x = transformer_block(
                     hidden_states=x,
                     attention_mask=attn_mask,
                     timestep=t,
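In the hunk above, when `streaming is True` the attention mask is built with a fixed chunk size (`self.static_chunk_size`) and unlimited left context (the trailing `-1`), and `mask_to_bias` then turns the boolean mask into an additive attention bias. A rough, pure-torch illustration of that block-causal pattern; the real mask comes from `cosyvoice.utils.mask.add_optional_chunk_mask`, so treat this as a sketch of the shape only:

```python
import torch

def static_chunk_mask(seq_len: int, chunk_size: int) -> torch.Tensor:
    # Frame i may attend to frame j iff j lies in the same chunk as i or in an
    # earlier chunk (unlimited left context), matching the streaming=True case above.
    idx = torch.arange(seq_len)
    chunk_end = (idx // chunk_size + 1) * chunk_size
    return idx.unsqueeze(0) < chunk_end.unsqueeze(1)    # (seq_len, seq_len), True = allowed

print(static_chunk_mask(6, 2).int())
# tensor([[1, 1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1]])
```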
@@ -780,124 +474,21 @@ class CausalConditionalDecoder(ConditionalDecoder):
             mask_up = masks.pop()
             skip = hiddens.pop()
             x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
-            x
+            x = resnet(x, mask_up, t)
             x = rearrange(x, "b c t -> b t c").contiguous()
             if streaming is True:
-                attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size,
+                attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
             else:
                 attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
             attn_mask = mask_to_bias(attn_mask, x.dtype)
             for transformer_block in transformer_blocks:
-                x
+                x = transformer_block(
                     hidden_states=x,
                     attention_mask=attn_mask,
                     timestep=t,
                 )
             x = rearrange(x, "b t c -> b c t").contiguous()
-            x
-        x
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
         output = self.final_proj(x * mask_up)
         return output * mask
-
-    @torch.inference_mode()
-    def forward_chunk(self, x, mask, mu, t, spks=None, cond=None,
-                      down_blocks_conv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-                      down_blocks_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0, 0),
-                      mid_blocks_conv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-                      mid_blocks_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0, 0),
-                      up_blocks_conv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
-                      up_blocks_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0, 0),
-                      final_blocks_conv_cache: torch.Tensor = torch.zeros(0, 0, 0)
-                      ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Forward pass of the UNet1DConditional model.
-
-        Args:
-            x (torch.Tensor): shape (batch_size, in_channels, time)
-            mask (_type_): shape (batch_size, 1, time)
-            t (_type_): shape (batch_size)
-            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
-            cond (_type_, optional): placeholder for future use. Defaults to None.
-
-        Raises:
-            ValueError: _description_
-            ValueError: _description_
-
-        Returns:
-            _type_: _description_
-        """
-
-        t = self.time_embeddings(t).to(t.dtype)
-        t = self.time_mlp(t)
-
-        x = pack([x, mu], "b * t")[0]
-
-        if spks is not None:
-            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
-            x = pack([x, spks], "b * t")[0]
-        if cond is not None:
-            x = pack([x, cond], "b * t")[0]
-
-        hiddens = []
-        masks = [mask]
-
-        down_blocks_kv_cache_new = torch.zeros(1, 4, 2, x.size(2), 512, 2).to(x.device)
-        mid_blocks_kv_cache_new = torch.zeros(12, 4, 2, x.size(2), 512, 2).to(x.device)
-        up_blocks_kv_cache_new = torch.zeros(1, 4, 2, x.size(2), 512, 2).to(x.device)
-        for index, (resnet, transformer_blocks, downsample) in enumerate(self.down_blocks):
-            mask_down = masks[-1]
-            x, down_blocks_conv_cache[index][:, :320], down_blocks_conv_cache[index][:, 320: 576] = \
-                resnet(x, mask_down, t, down_blocks_conv_cache[index][:, :320], down_blocks_conv_cache[index][:, 320: 576])
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            attn_mask = torch.ones(x.size(0), x.size(1), x.size(1) + down_blocks_kv_cache.size(3), device=x.device).bool()
-            attn_mask = mask_to_bias(attn_mask, x.dtype)
-            for i, transformer_block in enumerate(transformer_blocks):
-                x, down_blocks_kv_cache_new[index, i] = transformer_block(
-                    hidden_states=x,
-                    attention_mask=attn_mask,
-                    timestep=t,
-                    cache=down_blocks_kv_cache[index, i],
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-            hiddens.append(x)  # Save hidden states for skip connections
-            x, down_blocks_conv_cache[index][:, 576:] = downsample(x * mask_down, down_blocks_conv_cache[index][:, 576:])
-            masks.append(mask_down[:, :, ::2])
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-
-        for index, (resnet, transformer_blocks) in enumerate(self.mid_blocks):
-            x, mid_blocks_conv_cache[index][:, :256], mid_blocks_conv_cache[index][:, 256:] = \
-                resnet(x, mask_mid, t, mid_blocks_conv_cache[index][:, :256], mid_blocks_conv_cache[index][:, 256:])
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            attn_mask = torch.ones(x.size(0), x.size(1), x.size(1) + mid_blocks_kv_cache.size(3), device=x.device).bool()
-            attn_mask = mask_to_bias(attn_mask, x.dtype)
-            for i, transformer_block in enumerate(transformer_blocks):
-                x, mid_blocks_kv_cache_new[index, i] = transformer_block(
-                    hidden_states=x,
-                    attention_mask=attn_mask,
-                    timestep=t,
-                    cache=mid_blocks_kv_cache[index, i]
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-
-        for index, (resnet, transformer_blocks, upsample) in enumerate(self.up_blocks):
-            mask_up = masks.pop()
-            skip = hiddens.pop()
-            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
-            x, up_blocks_conv_cache[index][:, :512], up_blocks_conv_cache[index][:, 512: 768] = \
-                resnet(x, mask_up, t, up_blocks_conv_cache[index][:, :512], up_blocks_conv_cache[index][:, 512: 768])
-            x = rearrange(x, "b c t -> b t c").contiguous()
-            attn_mask = torch.ones(x.size(0), x.size(1), x.size(1) + up_blocks_kv_cache.size(3), device=x.device).bool()
-            attn_mask = mask_to_bias(attn_mask, x.dtype)
-            for i, transformer_block in enumerate(transformer_blocks):
-                x, up_blocks_kv_cache_new[index, i] = transformer_block(
-                    hidden_states=x,
-                    attention_mask=attn_mask,
-                    timestep=t,
-                    cache=up_blocks_kv_cache[index, i]
-                )
-            x = rearrange(x, "b t c -> b c t").contiguous()
-            x, up_blocks_conv_cache[index][:, 768:] = upsample(x * mask_up, up_blocks_conv_cache[index][:, 768:])
-        x, final_blocks_conv_cache = self.final_block(x, mask_up, final_blocks_conv_cache)
-        output = self.final_proj(x * mask_up)
-        return output * mask, down_blocks_conv_cache, down_blocks_kv_cache_new, mid_blocks_conv_cache, mid_blocks_kv_cache_new, \
-            up_blocks_conv_cache, up_blocks_kv_cache_new, final_blocks_conv_cache
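With `forward_chunk` and its explicitly threaded conv/KV cache tensors removed, the up-sampling path above still fuses skip connections through `einops.pack`, as in the context line `x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]`. A small sketch of what that pack call does; the channel counts here are made up for illustration:

```python
import torch
from einops import pack

x = torch.randn(2, 256, 50)      # current feature map: (batch, channels, time)
skip = torch.randn(2, 512, 50)   # hidden state saved on the down path

# "b * t" keeps batch and time aligned and concatenates whatever the starred
# axis absorbs, i.e. the channel dimensions of the two tensors.
fused, packed_shapes = pack([x[:, :, :skip.shape[-1]], skip], "b * t")

assert fused.shape == (2, 256 + 512, 50)
```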
xinference/thirdparty/cosyvoice/flow/flow.py

@@ -92,7 +92,6 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         mask = (~make_pad_mask(feat_len)).to(h)
         # NOTE this is unnecessary, feat/h already same shape
-        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
         loss, _ = self.decoder.compute_loss(
             feat.transpose(1, 2).contiguous(),
             mask.unsqueeze(1),
@@ -214,7 +213,6 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         h = self.encoder_proj(h)
 
         # get conditions
-        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
         conds = torch.zeros(feat.shape, device=token.device)
         for i, j in enumerate(feat_len):
             if random.random() < 0.5:
@@ -243,7 +241,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                   prompt_feat,
                   prompt_feat_len,
                   embedding,
-
+                  streaming,
                   finalize):
         assert token.shape[0] == 1
         # xvec projection
@@ -257,16 +255,10 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
 
         # text encode
         if finalize is True:
-            h, h_lengths
+            h, h_lengths = self.encoder(token, token_len, streaming=streaming)
         else:
             token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
-            h, h_lengths
-            cache['encoder_cache']['offset'] = encoder_cache[0]
-            cache['encoder_cache']['pre_lookahead_layer_conv2_cache'] = encoder_cache[1]
-            cache['encoder_cache']['encoders_kv_cache'] = encoder_cache[2]
-            cache['encoder_cache']['upsample_offset'] = encoder_cache[3]
-            cache['encoder_cache']['upsample_conv_cache'] = encoder_cache[4]
-            cache['encoder_cache']['upsample_kv_cache'] = encoder_cache[5]
+            h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
         mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
         h = self.encoder_proj(h)
 
@@ -276,14 +268,14 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         conds = conds.transpose(1, 2)
 
         mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
-        feat,
+        feat, _ = self.decoder(
             mu=h.transpose(1, 2).contiguous(),
             mask=mask.unsqueeze(1),
             spks=embedding,
             cond=conds,
             n_timesteps=10,
-
+            streaming=streaming
         )
         feat = feat[:, :, mel_len1:]
         assert feat.shape[2] == mel_len2
-        return feat.float(),
+        return feat.float(), None
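The final hunk unpacks the decoder output as `feat, _ = self.decoder(...)` and returns `(feat.float(), None)`, so callers that expect a (mel, cache) pair keep working while the cache slot is now always `None`. The valid-frame mask on the line above it is the inverse of `make_pad_mask`; a pure-torch stand-in for that helper, for illustration only (the real one lives in `cosyvoice.utils.mask`):

```python
import torch

def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor:
    # True marks padded positions; the diff negates it so valid frames become 1.
    max_len = int(lengths.max())
    idx = torch.arange(max_len, device=lengths.device)
    return idx.unsqueeze(0) >= lengths.unsqueeze(1)     # (batch, max_len)

lengths = torch.tensor([4, 2])
valid = ~make_pad_mask(lengths)
print(valid.int())
# tensor([[1, 1, 1, 1],
#         [1, 1, 0, 0]])
```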