xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (132)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +54 -1
  4. xinference/client/restful/restful_client.py +82 -2
  5. xinference/constants.py +3 -0
  6. xinference/core/chat_interface.py +297 -83
  7. xinference/core/model.py +24 -3
  8. xinference/core/progress_tracker.py +16 -8
  9. xinference/core/supervisor.py +51 -1
  10. xinference/core/worker.py +315 -47
  11. xinference/deploy/cmdline.py +33 -1
  12. xinference/model/audio/core.py +11 -1
  13. xinference/model/audio/megatts.py +105 -0
  14. xinference/model/audio/model_spec.json +24 -1
  15. xinference/model/audio/model_spec_modelscope.json +26 -1
  16. xinference/model/core.py +14 -0
  17. xinference/model/embedding/core.py +6 -1
  18. xinference/model/flexible/core.py +6 -1
  19. xinference/model/image/core.py +6 -1
  20. xinference/model/image/model_spec.json +17 -1
  21. xinference/model/image/model_spec_modelscope.json +17 -1
  22. xinference/model/llm/__init__.py +4 -6
  23. xinference/model/llm/core.py +5 -0
  24. xinference/model/llm/llama_cpp/core.py +46 -17
  25. xinference/model/llm/llm_family.json +530 -85
  26. xinference/model/llm/llm_family.py +24 -1
  27. xinference/model/llm/llm_family_modelscope.json +572 -1
  28. xinference/model/llm/mlx/core.py +16 -2
  29. xinference/model/llm/reasoning_parser.py +3 -3
  30. xinference/model/llm/sglang/core.py +111 -13
  31. xinference/model/llm/transformers/__init__.py +14 -0
  32. xinference/model/llm/transformers/core.py +31 -6
  33. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  34. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  35. xinference/model/llm/transformers/gemma3.py +17 -2
  36. xinference/model/llm/transformers/intern_vl.py +28 -18
  37. xinference/model/llm/transformers/minicpmv26.py +21 -2
  38. xinference/model/llm/transformers/qwen-omni.py +308 -0
  39. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  40. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  41. xinference/model/llm/utils.py +37 -15
  42. xinference/model/llm/vllm/core.py +184 -8
  43. xinference/model/llm/vllm/distributed_executor.py +320 -0
  44. xinference/model/rerank/core.py +22 -12
  45. xinference/model/utils.py +118 -1
  46. xinference/model/video/core.py +6 -1
  47. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  48. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  49. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  50. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  51. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  52. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  53. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  54. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  55. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  56. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  57. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  58. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  59. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  60. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  61. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  62. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  63. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  64. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  65. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  66. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  67. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  68. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  69. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  70. xinference/thirdparty/megatts3/__init__.py +0 -0
  71. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  72. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  73. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  74. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  75. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  76. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  77. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  78. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  79. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  80. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  81. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  82. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  83. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  84. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  85. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  86. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  87. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  88. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  89. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  90. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  91. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  92. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  93. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  94. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  95. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  96. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  97. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  98. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  99. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  100. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  101. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  102. xinference/types.py +10 -0
  103. xinference/utils.py +54 -0
  104. xinference/web/ui/build/asset-manifest.json +6 -6
  105. xinference/web/ui/build/index.html +1 -1
  106. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  107. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  108. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  109. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  112. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  113. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  114. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  116. xinference/web/ui/src/locales/en.json +2 -1
  117. xinference/web/ui/src/locales/zh.json +2 -1
  118. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
  119. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
  120. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
  121. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  122. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  123. xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
  124. xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  129. /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  130. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
  131. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
  132. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py
@@ -0,0 +1,318 @@
+ # MIT License
+
+ # Copyright (c) 2022 OpenAI
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # Copyright (c) [2022] [OpenAI]
+ # Copyright (c) [2025] [Ziyue Jiang]
+ # SPDX-License-Identifier: MIT
+ # This file has been modified by Ziyue Jiang on 2025/03/19
+ # Original file was released under MIT, with the full license text # available at https://github.com/openai/whisper/blob/v20240930/LICENSE.
+ # This modified file is released under the same license.
+
+ from contextlib import contextmanager
+ from typing import Dict, Iterable, Optional, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+
+ from torch.nn.functional import scaled_dot_product_attention
+ SDPA_AVAILABLE = True
+
+
+ class LayerNorm(nn.LayerNorm):
+     def forward(self, x: Tensor) -> Tensor:
+         return super().forward(x.float()).type(x.dtype)
+
+
+ class Linear(nn.Linear):
+     def forward(self, x: Tensor) -> Tensor:
+         return F.linear(
+             x,
+             self.weight.to(x.dtype),
+             None if self.bias is None else self.bias.to(x.dtype),
+         )
+
+
+ class Conv1d(nn.Conv1d):
+     def _conv_forward(
+         self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+     ) -> Tensor:
+         return super()._conv_forward(
+             x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+         )
+
+
+ def sinusoids(length, channels, max_timescale=10000):
+     """Returns sinusoids for positional embedding"""
+     assert channels % 2 == 0
+     log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+     inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+ @contextmanager
+ def disable_sdpa():
+     prev_state = MultiHeadAttention.use_sdpa
+     try:
+         MultiHeadAttention.use_sdpa = False
+         yield
+     finally:
+         MultiHeadAttention.use_sdpa = prev_state
+
+
+ class MultiHeadAttention(nn.Module):
+     use_sdpa = True
+
+     def __init__(self, n_state: int, n_head: int):
+         super().__init__()
+         self.n_head = n_head
+         self.query = Linear(n_state, n_state)
+         self.key = Linear(n_state, n_state, bias=False)
+         self.value = Linear(n_state, n_state)
+         self.out = Linear(n_state, n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None
+     ):
+         q = self.query(x)
+
+         if kv_cache is None or xa is None or self.key not in kv_cache:
+             # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+             # otherwise, perform key/value projections for self- or cross-attention as usual.
+             k = self.key(x if xa is None else xa)
+             v = self.value(x if xa is None else xa)
+         else:
+             # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+             k = kv_cache[self.key]
+             v = kv_cache[self.value]
+
+         wv = self.qkv_attention(q, k, v, mask, casual)
+         return self.out(wv)
+
+     def qkv_attention(
+         self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None, casual: Optional[bool] = None
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         n_batch, n_ctx, n_state = q.shape
+         scale = (n_state // self.n_head) ** -0.25
+         q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+         a = scaled_dot_product_attention(
+             q, k, v, is_causal=casual and n_ctx > 1, attn_mask=mask[:, None, None, :] if mask is not None else None
+         )
+         out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
+         return out
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+         super().__init__()
+
+         self.attn = MultiHeadAttention(n_state, n_head)
+         self.attn_ln = LayerNorm(n_state)
+
+         self.cross_attn = (
+             MultiHeadAttention(n_state, n_head) if cross_attention else None
+         )
+         self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+
+         n_mlp = n_state * 4
+         self.mlp = nn.Sequential(
+             Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+         )
+         self.mlp_ln = LayerNorm(n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None,
+     ):
+         x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache, casual=casual)
+         if self.cross_attn:
+             # TODO: Cross attention mask
+             x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache, casual=False)
+         x = x + self.mlp(self.mlp_ln(x))
+         return x
+
+
+ class AudioEncoder(nn.Module):
+     def __init__(
+         self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+         self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+         self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+         self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+         )
+         self.ln_post = LayerNorm(n_state)
+
+     def forward(self, x: Tensor, attn_mask: Tensor):
+         """
+         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+             the mel spectrogram of the audio
+         """
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = x.permute(0, 2, 1)
+
+         # assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
+         x = (x + self.positional_embedding[:x.size(1)]).to(x.dtype)
+
+         for block in self.blocks:
+             x = block(x, mask=attn_mask, casual=False)
+
+         x = self.ln_post(x)
+         return x
+
+
+ class TextDecoder(nn.Module):
+     def __init__(
+         self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+
+         self.token_embedding = nn.Embedding(n_vocab, n_state)
+         self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [
+                 ResidualAttentionBlock(n_state, n_head, cross_attention=True)
+                 for _ in range(n_layer)
+             ]
+         )
+         self.ln = LayerNorm(n_state)
+
+         self.out_proj = nn.Linear(n_state, n_vocab)
+
+     def forward(self, x: Tensor, attn_mask: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+         """
+         x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+             the text tokens
+         xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
+             the encoded audio features to be attended on
+         """
+         offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+         x = (
+             self.token_embedding(x)
+             + self.positional_embedding[offset : offset + x.shape[-1]]
+         )
+         x = x.to(xa.dtype)
+
+         for block in self.blocks:
+             x = block(x, xa, mask=attn_mask, kv_cache=kv_cache, casual=True)
+
+         x = self.ln(x)
+         # logits = (
+         #     x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
+         # ).float()
+         logits = self.out_proj(x)
+
+         return logits
+
+
+ class Whisper(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.n_vocab = 6800
+         self.n_text_layer = 6
+         self.n_text_head = 8
+         self.n_text_ctx = 2048
+
+         self.encoder = AudioEncoder(
+             n_mels=80, n_ctx=3000, n_state=512, n_head=8, n_layer=6,
+         )
+         self.decoder = TextDecoder(
+             n_vocab=6800, n_ctx=2048, n_state=512, n_head=8, n_layer=6,
+         )
+
+     def embed_audio(self, mel: torch.Tensor):
+         return self.encoder(mel, None)
+
+     def logits(self, tokens, audio_features, kv_cache=None):
+         return self.decoder(tokens, None, audio_features, kv_cache=kv_cache)
+
+     def forward(
+         self, mel, mel_len, token, token_len
+     ) -> Dict[str, torch.Tensor]:
+         attn_mask_enc = self.sequence_mask(mel_len//2, device=mel.device) > 0
+         attn_mask_dec = self.sequence_mask(token_len, device=mel.device) > 0
+         return self.decoder(token, attn_mask_dec, self.encoder(mel, attn_mask_enc))
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+         """
+         The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+         tensors calculated for the previous positions. This method returns a dictionary that stores
+         all caches, and the necessary hooks for the key and value projection modules that save the
+         intermediate tensors to be reused during later calculations.
+
+         Returns
+         -------
+         cache : Dict[nn.Module, torch.Tensor]
+             A dictionary object mapping the key/value projection modules to its cache
+         hooks : List[RemovableHandle]
+             List of PyTorch RemovableHandle objects to stop the hooks to be called
+         """
+         cache = {**cache} if cache is not None else {}
+         hooks = []
+
+         def save_to_cache(module, _, output):
+             if module not in cache or output.shape[1] > self.n_text_ctx:
+                 # save as-is, for the first token or cross attention
+                 cache[module] = output
+             else:
+                 cache[module] = torch.cat([cache[module], output], dim=1).detach()
+             return cache[module]
+
+         def install_hooks(layer: nn.Module):
+             if isinstance(layer, MultiHeadAttention):
+                 hooks.append(layer.key.register_forward_hook(save_to_cache))
+                 hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+         self.decoder.apply(install_hooks)
+         return cache, hooks
+
+     def sequence_mask(self, seq_lens, max_len=None, device='cpu'):
+         b = seq_lens.shape[0]
+         if max_len is None:
+             max_len = seq_lens.max()
+         mask = torch.arange(max_len).unsqueeze(0).to(device) # [1, t]
+         mask = mask < (seq_lens.unsqueeze(1)) # [1, t] + [b, 1] = [b, t]
+         mask = mask.float()
+         return mask
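
For orientation, below is a minimal, hypothetical sketch (not part of the package diff) of how the vendored aligner above fits together. It uses randomly initialized weights and dummy tensors, so it only demonstrates the expected tensor shapes, not real alignment:

    import torch

    model = Whisper().eval()                   # 80-mel encoder, 6800-token decoder (hard-coded above)
    mel = torch.randn(1, 80, 3000)             # (batch, n_mels, frames)
    tokens = torch.randint(1, 6800, (1, 16))   # (batch, text_len)

    with torch.no_grad():
        audio_features = model.embed_audio(mel)        # (1, 1500, 512); conv2 halves the frame axis
        logits = model.logits(tokens, audio_features)  # (1, 16, 6800)

    # For incremental decoding, install_kv_cache_hooks() returns a cache dict plus
    # forward hooks that append each step's key/value projections for later reuse.
    cache, hooks = model.install_kv_cache_hooks()
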
xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py
@@ -0,0 +1,362 @@
+ # Copyright 2025 ByteDance and/or its affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import random
+ from copy import deepcopy
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from torch.nn import Linear
+ from tqdm import tqdm
+
+ from tts.modules.ar_dur.commons.layers import Embedding, LayerNorm
+ from tts.modules.ar_dur.commons.nar_tts_modules import PosEmb
+ from tts.modules.ar_dur.commons.rot_transformer import RotTransformerDecoderLayer
+ from tts.modules.ar_dur.commons.transformer import SinusoidalPositionalEmbedding
+ from tts.modules.ar_dur.commons.rel_transformer import RelTransformerEncoder
+
+ FS_ENCODERS = {
+     'rel_fft': lambda hp, dict_size: RelTransformerEncoder(
+         dict_size, hp['hidden_size'], hp['hidden_size'],
+         hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'],
+         hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']),
+ }
+
+ def fill_with_neg_inf2(t):
+     """FP16-compatible function that fills a tensor with -inf."""
+     return t.float().fill_(-1e8).type_as(t)
+
+ def expand_states(h, mel2token):
+     h = F.pad(h, [0, 0, 1, 0])
+     mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]])
+     h = torch.gather(h, 1, mel2token_)  # [B, T, H]
+     return h
+
+
+ class CodePredictor(nn.Module):
+     def __init__(self, hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size):
+         super().__init__()
+         self.hparams = deepcopy(hparams)
+         self.hparams['hidden_size'] = hidden_size
+         self.hidden_size = hidden_size
+         char_dict_size = hparams.get('char_dict_size', 4000)
+         if not hparams.get('lm_use_enc'):
+             self.encoder = nn.Embedding(dict_size, self.hidden_size, padding_idx=0)
+             if hparams.get('mega_use_char', True):
+                 self.char_encoder = nn.Embedding(char_dict_size,
+                                                  self.hidden_size, padding_idx=0)
+         else:
+             self.encoder = FS_ENCODERS[self.hparams['encoder_type']](self.hparams, dict_size)
+             if hparams.get('mega_use_char', True):
+                 self.char_encoder = FS_ENCODERS[self.hparams['encoder_type']](self.hparams, char_dict_size)
+             if hparams['use_ph_pos_embed']:
+                 self.ph_pos_embed = PosEmb(self.hidden_size)
+
+         self.char_empty_embed = nn.Embedding(1, self.hidden_size)
+         if hparams.get('use_bert_input'):
+             self.bert_input_proj = nn.Linear(768, self.hidden_size)
+         self.ling_label_embed_layers = nn.ModuleDict()
+         for k, s in zip(hparams['ling_labels'], hparams['ling_label_dict_size']):
+             self.ling_label_embed_layers[k] = Embedding(s + 3, self.hidden_size, padding_idx=0)
+
+         self.dec_hidden_size = dec_hidden_size
+         self.enc_proj = nn.Linear(self.hidden_size, dec_hidden_size)
+         self.code_emb = Embedding(code_size + 2, dec_hidden_size, 0)
+         self.use_pos_embed = hparams.get('use_pos_embed', False)
+         if self.use_pos_embed:
+             self.embed_positions = SinusoidalPositionalEmbedding(dec_hidden_size, 0, init_size=1024)
+         self.use_post_ln = hparams.get('use_post_ln', False)
+         self.layers = None
+         if not self.use_post_ln:
+             self.layer_norm = LayerNorm(dec_hidden_size)
+         self.code_size = code_size
+         self.project_out_dim = Linear(dec_hidden_size, code_size + 1, bias=True)
+
+     def forward_ling_encoder(
+             self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed, spk_id, spk_embed, mels_timbre):
+         ph_tokens = txt_tokens
+         hparams = self.hparams
+         ph_nonpadding = (ph_tokens > 0).float()[:, :, None]  # [B, T_phone, 1]
+         x_spk = self.forward_style_embed(spk_embed, spk_id, mels_timbre)
+
+         # enc_ph
+         if not hparams.get('lm_use_enc'):
+             x_ph = self.encoder(ph_tokens)
+             x_ph = x_ph + sum(
+                 [self.ling_label_embed_layers[k](ling_feas[k]) for k in hparams['ling_labels']]) \
+                 if len(hparams['ling_labels']) > 0 else 0
+             x_ph = x_ph + x_spk
+         else:
+             # enc_ph
+             ph_enc_oembed = sum(
+                 [self.ling_label_embed_layers[k](ling_feas[k]) for k in hparams['ling_labels']]) \
+                 if len(hparams['ling_labels']) > 0 else 0
+             ph_enc_oembed = ph_enc_oembed + self.ph_pos_embed(
+                 torch.arange(0, ph_tokens.shape[1])[None,].to(ph_tokens.device))
+             ph_enc_oembed = ph_enc_oembed + x_spk
+             ph_enc_oembed = ph_enc_oembed * ph_nonpadding
+             x_ph = self.encoder(ph_tokens, other_embeds=ph_enc_oembed)
+
+         # enc_char
+         if char_tokens is not None and ph2char is not None:
+             char_nonpadding = (char_tokens > 0).float()[:, :, None]
+             x_char = self.char_encoder(char_tokens)
+             empty_char = (ph2char > 100000).long()
+             ph2char = ph2char * (1 - empty_char)
+             x_char_phlevel = \
+                 expand_states(x_char * char_nonpadding, ph2char) \
+                 * (1 - empty_char)[..., None] + \
+                 self.char_empty_embed(torch.zeros_like(ph_tokens)) * empty_char[..., None]
+         else:
+             x_char_phlevel = 0
+         # x_ling
+         x_ling = x_ph + x_char_phlevel
+         x_ling = x_ling * ph_nonpadding
+         x_ling = self.enc_proj(x_ling)
+         return x_ling
+
+     def sample_one_step(self, vq_pred):
+         hparams = self.hparams
+         if hparams.get('infer_top_k'):
+             top_k = hparams.get('infer_top_k')
+             temperature = hparams.get('infer_temperature', 1)
+             vq_pred = vq_pred[:, -1] / temperature
+             # optionally crop the logits to only the top k options
+             if top_k is not None:
+                 v, _ = torch.topk(vq_pred, min(top_k, vq_pred.size(-1)))
+                 vq_pred[vq_pred < v[:, [-1]]] = -float('Inf')
+             # apply softmax to convert logits to (normalized) probabilities
+             probs = F.softmax(vq_pred, dim=-1)
+             # sample from the distribution
+             vq_pred = torch.multinomial(probs, num_samples=1)
+         else:
+             vq_pred = torch.argmax(F.softmax(vq_pred[:, -1], dim=-1), 1)
+         return vq_pred
+
+     def forward_style_embed(self, spk_embed=None, spk_id=None, mel_ref=None):
+         # add spk embed
+         style_embed = 0
+         if self.hparams['use_spk_embed']:
+             style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :]
+         if self.hparams['use_spk_id']:
+             style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :]
+         if self.hparams['use_spk_enc']:
+             style_embed = style_embed + self.spk_enc(mel_ref)[:, None, :]
+         return style_embed
+
+     def buffered_future_mask(self, tensor):
+         dim = tensor.size(0)
+         if (
+                 not hasattr(self, '_future_mask')
+                 or self._future_mask is None
+                 or self._future_mask.device != tensor.device
+                 or self._future_mask.size(0) < dim
+         ):
+             self._future_mask = torch.triu(fill_with_neg_inf2(tensor.new(dim, dim)), 1)
+         return self._future_mask[:dim, :dim]
+
+
+ class ARDurPredictor(CodePredictor):
+     def __init__(self, hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size, use_rot_embed=True,
+                  op_version=1):
+         super().__init__(hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size)
+         self.use_rot_embed = use_rot_embed
+         bias = hparams.get('lm_bias', True)
+         if self.use_rot_embed:
+             self.layers = nn.ModuleList([])
+             self.layers.extend([
+                 RotTransformerDecoderLayer(
+                     dec_hidden_size, 0.0, kernel_size=1, ffn_hidden_size=dec_hidden_size * 4,
+                     post_ln=self.use_post_ln, op_version=op_version, bias=bias)
+                 for _ in range(lm_num_layers)
+             ])
+         if hparams['dur_model_type'] == 'ar_mse':
+             self.project_out_dim = nn.Sequential(torch.nn.Linear(dec_hidden_size, 1), nn.Softplus())
+         else:
+             self.project_out_dim = torch.nn.Linear(dec_hidden_size, code_size + 1)
+
+     def forward(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+                 prev_code, spk_id=None, spk_embed=None, mels_timbre=None, mel2ph=None,
+                 incremental_state=None, x_ling=None, attn_mask=None, spk_pos_ids_flat=None,
+                 prompt_length=None, cache_size=20, streaming=False):
+         x = self.code_emb(prev_code)
+         if x_ling is None:
+             x_ling = self.forward_ling_encoder(
+                 txt_tokens, ling_feas, char_tokens, ph2char, bert_embed, spk_id, spk_embed, mels_timbre)
+             x_ling = x_ling.flatten(0, 1)
+             txt_tokens = txt_tokens.flatten(0, 1)
+             x_ling = x_ling[txt_tokens > 0][None]
+
+         # run decoder
+         self_attn_padding_mask = None
+         if self.use_pos_embed:
+             positions = self.embed_positions(
+                 prev_code,
+                 incremental_state=incremental_state
+             )
+         if incremental_state is not None:
+             x_ling = x_ling[:, x.shape[1] - 1:x.shape[1]]
+             if spk_pos_ids_flat is not None:
+                 spk_pos_ids_flat = spk_pos_ids_flat[:, x.shape[1] - 1:x.shape[1]]
+             x = x[:, -1:]
+             if self.use_pos_embed:
+                 positions = positions[:, -1:]
+             if streaming:
+                 # Shift Pos: query pos is min(cache_size, idx)
+                 spk_pos_ids_flat = torch.min(torch.LongTensor([prompt_length + cache_size]).to(x.device),
+                                              spk_pos_ids_flat)
+
+         # # B x T x C -> T x B x C
+         if self.use_pos_embed:
+             x = x + positions
+         x_ling = x_ling[:, :self.hparams['max_tokens']].contiguous()
+         T = min(self.hparams.get('max_tokens_per_item', 1e9), x_ling.shape[1])
+         x_ling = x_ling.reshape(-1, T, x_ling.shape[-1])
+         x = x + x_ling
+         x = x.transpose(0, 1)
+
+         for idx, layer in enumerate(self.layers):
+             if incremental_state is None:
+                 self_attn_mask = self.buffered_future_mask(x)
+                 if attn_mask is not None:
+                     self_attn_mask = self_attn_mask + (1 - attn_mask.float()) * -1e8
+                 self_attn_mask = self_attn_mask.clamp_min(-1e8)
+             else:
+                 self_attn_mask = None
+
+             x, attn_weights = layer(
+                 x,
+                 incremental_state=incremental_state,
+                 self_attn_mask=self_attn_mask,
+                 self_attn_padding_mask=self_attn_padding_mask,
+                 spk_pos_ids_flat=spk_pos_ids_flat
+             )
+
+             if streaming and incremental_state != {}:
+                 for k, v in incremental_state.items():
+                     if 'attn_state' in k:
+                         prev_key, prev_value = incremental_state[k]['prev_key'], incremental_state[k]['prev_value']
+                         cur_length = prev_key.shape[2]
+                         if cur_length - prompt_length > cache_size:
+                             prev_key = torch.cat((prev_key[:, :, :prompt_length], prev_key[:, :, -cache_size:]), dim=2)
+                             prev_value = torch.cat((prev_value[:, :, :prompt_length], prev_value[:, :, -cache_size:]),
+                                                    dim=2)
+                         incremental_state[k]['prev_key'], incremental_state[k]['prev_value'] = prev_key, prev_value
+
+         if not self.use_post_ln:
+             x = self.layer_norm(x)
+         # T x B x C -> B x T x C
+         x = x.transpose(0, 1)
+         x = self.project_out_dim(x)
+         return x
+
+     def infer(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+               spk_id=None, spk_embed=None, mels_timbre=None,
+               incremental_state=None, ctx_vqcodes=None, spk_pos_ids_flat=None, return_state=False,
+               first_step_min=0, return_probs=False, first_decoder_inp=None, dur_disturb=0.0, **kwargs):
+         if incremental_state is None:
+             incremental_state = {}
+         x_ling = self.forward_ling_encoder(
+             txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+             spk_id, spk_embed, mels_timbre)
+         x_ling = x_ling.flatten(0, 1)
+         txt_tokens_ori = txt_tokens
+         txt_tokens_withpad = txt_tokens = txt_tokens.flatten(0, 1)
+         x_ling = x_ling[txt_tokens > 0][None]
+         txt_tokens = txt_tokens[txt_tokens > 0][None]
+
+         decoded = torch.zeros_like(txt_tokens)
+         decoded = F.pad(decoded, [1, 0], value=self.code_size + 1)
+         if incremental_state != {}:
+             if first_decoder_inp is None:
+                 assert ctx_vqcodes is not None
+                 decoded[:, :ctx_vqcodes.shape[1]] = ctx_vqcodes
+                 ctx_vqcodes = None
+             else:
+                 decoded[:, :1] = first_decoder_inp
+         probs = []
+         for step in range(decoded.shape[1] - 1):
+             vq_pred = self(txt_tokens, None, None, None, None,
+                            decoded[:, :step + 1], None, None, None,
+                            incremental_state=incremental_state, x_ling=x_ling,
+                            spk_pos_ids_flat=spk_pos_ids_flat, **kwargs)
+             probs.append(vq_pred.cpu())
+             if ctx_vqcodes is None or step >= ctx_vqcodes.shape[1]:
+                 if self.hparams['dur_model_type'] == 'ar_mse':
+                     d = vq_pred[:, -1, 0]
+                     if dur_disturb > 0 and step >= 1:
+                         if random.random() > 0.5:
+                             d = d * (1 + random.random() * dur_disturb)
+                         else:
+                             d = d / (1 + random.random() * dur_disturb)
+                     d = torch.clamp_max(d, self.code_size - 1)
+                     vq_pred = torch.round(d).long()
+                 else:
+                     vq_pred = self.sample_one_step(vq_pred)
+                 decoded[:, step + 1] = torch.clamp_min(vq_pred, 1)
+                 if step == 0:
+                     decoded[:, step + 1] = torch.clamp_min(vq_pred, first_step_min)
+             else:
+                 decoded[:, step + 1] = ctx_vqcodes[:, step]
+         decoded = decoded[:, 1:]
+         decoded_2d = torch.zeros_like(txt_tokens_ori)
+         decoded_2d.flatten(0, 1)[txt_tokens_withpad > 0] = decoded
+         if return_state:
+             return decoded_2d, incremental_state
+         if return_probs:
+             return decoded_2d, torch.cat(probs, 1)
+         return decoded_2d
+
+     def streaming_infer(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+                         spk_id=None, spk_embed=None, mels_timbre=None,
+                         incremental_state=None, ctx_vqcodes=None, spk_pos_ids_flat=None, return_state=False,
+                         **kwargs):
+         if incremental_state is None:
+             incremental_state = {}
+         x_ling = self.forward_ling_encoder(
+             txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+             spk_id, spk_embed, mels_timbre)
+         x_ling = x_ling.flatten(0, 1)
+         txt_tokens_ori = txt_tokens
+         txt_tokens_withpad = txt_tokens = txt_tokens.flatten(0, 1)
+         x_ling = x_ling[txt_tokens > 0][None]
+         txt_tokens = txt_tokens[txt_tokens > 0][None]
+
+         vq_decoded = torch.zeros_like(txt_tokens)
+         vq_decoded = F.pad(vq_decoded, [1, 0], value=self.code_size + 1)
+         if incremental_state != {}:
+             assert ctx_vqcodes is not None
+             vq_decoded[:, :ctx_vqcodes.shape[1]] = ctx_vqcodes
+             ctx_vqcodes = None
+         prompt_length = list(incremental_state.items())[0][1]['prev_key'].shape[2]
+         for step in tqdm(range(vq_decoded.shape[1] - 1), desc='AR Duration Predictor inference...'):
+             vq_pred = self(txt_tokens, None, None, None, None,
+                            vq_decoded[:, :step + 1], None, None, None,
+                            incremental_state=incremental_state, x_ling=x_ling,
+                            spk_pos_ids_flat=spk_pos_ids_flat, prompt_length=prompt_length, streaming=True, **kwargs)
+             if ctx_vqcodes is None or step >= ctx_vqcodes.shape[1]:
+                 if self.hparams['dur_model_type'] == 'ar_mse':
+                     vq_pred = torch.round(vq_pred[:, -1, 0]).long()
+                 else:
+                     vq_pred = self.sample_one_step(vq_pred)
+                 vq_decoded[:, step + 1] = vq_pred
+             else:
+                 vq_decoded[:, step + 1] = ctx_vqcodes[:, step]
+         vq_decoded = vq_decoded[:, 1:]
+         vq_decoded_2d = torch.zeros_like(txt_tokens_ori)
+         vq_decoded_2d.flatten(0, 1)[txt_tokens_withpad > 0] = vq_decoded
+         if return_state:
+             return vq_decoded_2d, incremental_state
+         return vq_decoded_2d
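
As a reading aid (not part of the package diff), here is a small, self-contained Python sketch of what the expand_states() helper above computes: it prepends a zero row so that index 0 in the map selects an "empty" state, then gathers one hidden state per output position. The tensors below are made up for illustration:

    import torch
    import torch.nn.functional as F

    def expand_states(h, mel2token):               # same logic as in the hunk above
        h = F.pad(h, [0, 0, 1, 0])                 # prepend a zero vector at index 0
        mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]])
        return torch.gather(h, 1, mel2token_)      # [B, T_out, H]

    h = torch.arange(6.0).reshape(1, 3, 2)         # three source states of width 2
    idx = torch.tensor([[1, 1, 2, 3, 0]])          # repeat state 1, then states 2 and 3, then the zero pad
    print(expand_states(h, idx))
    # rows: [0, 1], [0, 1], [2, 3], [4, 5], [0, 0]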