xinference 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (104)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +50 -1
  3. xinference/client/restful/restful_client.py +82 -2
  4. xinference/constants.py +3 -0
  5. xinference/core/chat_interface.py +297 -83
  6. xinference/core/model.py +1 -0
  7. xinference/core/progress_tracker.py +16 -8
  8. xinference/core/supervisor.py +45 -1
  9. xinference/core/worker.py +262 -37
  10. xinference/deploy/cmdline.py +33 -1
  11. xinference/model/audio/core.py +11 -1
  12. xinference/model/audio/megatts.py +105 -0
  13. xinference/model/audio/model_spec.json +24 -1
  14. xinference/model/audio/model_spec_modelscope.json +26 -1
  15. xinference/model/core.py +14 -0
  16. xinference/model/embedding/core.py +6 -1
  17. xinference/model/flexible/core.py +6 -1
  18. xinference/model/image/core.py +6 -1
  19. xinference/model/image/model_spec.json +17 -1
  20. xinference/model/image/model_spec_modelscope.json +17 -1
  21. xinference/model/llm/__init__.py +0 -4
  22. xinference/model/llm/core.py +4 -0
  23. xinference/model/llm/llama_cpp/core.py +40 -16
  24. xinference/model/llm/llm_family.json +413 -84
  25. xinference/model/llm/llm_family.py +24 -1
  26. xinference/model/llm/llm_family_modelscope.json +447 -0
  27. xinference/model/llm/mlx/core.py +16 -2
  28. xinference/model/llm/transformers/__init__.py +14 -0
  29. xinference/model/llm/transformers/core.py +30 -6
  30. xinference/model/llm/transformers/gemma3.py +17 -2
  31. xinference/model/llm/transformers/intern_vl.py +28 -18
  32. xinference/model/llm/transformers/minicpmv26.py +21 -2
  33. xinference/model/llm/transformers/qwen-omni.py +308 -0
  34. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  35. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  36. xinference/model/llm/utils.py +11 -1
  37. xinference/model/llm/vllm/core.py +35 -0
  38. xinference/model/llm/vllm/distributed_executor.py +8 -2
  39. xinference/model/rerank/core.py +6 -1
  40. xinference/model/utils.py +118 -1
  41. xinference/model/video/core.py +6 -1
  42. xinference/thirdparty/megatts3/__init__.py +0 -0
  43. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  44. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  45. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  46. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  47. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  48. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  49. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  50. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  51. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  52. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  53. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  54. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  55. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  56. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  57. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  58. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  59. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  60. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  61. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  62. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  63. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  64. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  65. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  66. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  67. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  68. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  69. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  70. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  71. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  72. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  73. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  74. xinference/types.py +10 -0
  75. xinference/utils.py +54 -0
  76. xinference/web/ui/build/asset-manifest.json +6 -6
  77. xinference/web/ui/build/index.html +1 -1
  78. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  79. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  80. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  81. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  88. xinference/web/ui/src/locales/en.json +2 -1
  89. xinference/web/ui/src/locales/zh.json +2 -1
  90. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/METADATA +127 -114
  91. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/RECORD +96 -60
  92. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
  93. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  94. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  95. xinference/web/ui/build/static/js/main.5ca4eea1.js +0 -3
  96. xinference/web/ui/build/static/js/main.5ca4eea1.js.map +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  99. xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  101. /xinference/web/ui/build/static/js/{main.5ca4eea1.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  102. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
  103. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
  104. {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
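
The most visible addition in this release is the MegaTTS3 text-to-speech model (the new xinference/model/audio/megatts.py plus the vendored xinference/thirdparty/megatts3 package listed above). As rough orientation only, here is a hedged sketch of launching an audio model through the xinference RESTful client; the model name "MegaTTS3" and the idea that voice-cloning inputs go through extra kwargs are assumptions, not confirmed details of this release — the registered name lives in the new model_spec.json entries.

# Hedged sketch: launching the newly added audio model via the xinference client.
# Assumptions: the server runs on localhost:9997 and the model is registered as "MegaTTS3".
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(model_name="MegaTTS3", model_type="audio")
model = client.get_model(model_uid)

# speech() returns raw audio bytes; MegaTTS3 is a voice-cloning TTS, so a reference
# prompt is presumably passed through model-specific kwargs (an assumption here).
audio_bytes = model.speech("Hello from xinference 1.5.0.")
with open("output.wav", "wb") as f:
    f.write(audio_bytes)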

xinference/thirdparty/megatts3/tts/infer_cli.py
@@ -0,0 +1,277 @@
+ # Copyright 2025 ByteDance and/or its affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+ import os
+ import argparse
+ import librosa
+ import numpy as np
+ import torch
+
+ from tn.chinese.normalizer import Normalizer as ZhNormalizer
+ from tn.english.normalizer import Normalizer as EnNormalizer
+ from langdetect import detect as classify_language
+ from pydub import AudioSegment
+ import pyloudnorm as pyln
+
+ from tts.modules.ar_dur.commons.nar_tts_modules import LengthRegulator
+ from tts.frontend_function import g2p, align, make_dur_prompt, dur_pred, prepare_inputs_for_dit
+ from tts.utils.audio_utils.io import save_wav, to_wav_bytes, convert_to_wav_bytes, combine_audio_segments
+ from tts.utils.commons.ckpt_utils import load_ckpt
+ from tts.utils.commons.hparams import set_hparams, hparams
+ from tts.utils.text_utils.text_encoder import TokenTextEncoder
+ from tts.utils.text_utils.split_text import chunk_text_chinese, chunk_text_english
+ from tts.utils.commons.hparams import hparams, set_hparams
+
+
+ if "TOKENIZERS_PARALLELISM" not in os.environ:
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ def convert_to_wav(wav_path):
+     # Check if the file exists
+     if not os.path.exists(wav_path):
+         print(f"The file '{wav_path}' does not exist.")
+         return
+
+     # Check if the file already has a .wav extension
+     if not wav_path.endswith(".wav"):
+         # Define the output path with a .wav extension
+         out_path = os.path.splitext(wav_path)[0] + ".wav"
+
+         # Load the audio file using pydub and convert it to WAV
+         audio = AudioSegment.from_file(wav_path)
+         audio.export(out_path, format="wav")
+
+         print(f"Converted '{wav_path}' to '{out_path}'")
+
+
+ def cut_wav(wav_path, max_len=28):
+     audio = AudioSegment.from_file(wav_path)
+     audio = audio[:int(max_len * 1000)]
+     audio.export(wav_path, format="wav")
+
+ class MegaTTS3DiTInfer():
+     def __init__(
+             self,
+             device=None,
+             ckpt_root='./checkpoints',
+             dit_exp_name='diffusion_transformer',
+             frontend_exp_name='aligner_lm',
+             wavvae_exp_name='wavvae',
+             dur_ckpt_path='duration_lm',
+             g2p_exp_name='g2p',
+             precision=torch.float16,
+             **kwargs
+         ):
+         self.sr = 24000
+         self.fm = 8
+         if device is None:
+             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.device = device
+         self.precision = precision
+
+         # build models
+         self.dit_exp_name = os.path.join(ckpt_root, dit_exp_name)
+         self.frontend_exp_name = os.path.join(ckpt_root, frontend_exp_name)
+         self.wavvae_exp_name = os.path.join(ckpt_root, wavvae_exp_name)
+         self.dur_exp_name = os.path.join(ckpt_root, dur_ckpt_path)
+         self.g2p_exp_name = os.path.join(ckpt_root, g2p_exp_name)
+         self.build_model(self.device)
+
+         # init text normalizer
+         self.zh_normalizer = ZhNormalizer(overwrite_cache=False, remove_erhua=False, remove_interjections=False)
+         self.en_normalizer = EnNormalizer(overwrite_cache=False)
+         # loudness meter
+         self.loudness_meter = pyln.Meter(self.sr)
+
+     def build_model(self, device):
+         set_hparams(exp_name=self.dit_exp_name, print_hparams=False)
+
+         ''' Load Dict '''
+         current_dir = os.path.dirname(os.path.abspath(__file__))
+         ling_dict = json.load(open(f"{current_dir}/utils/text_utils/dict.json", encoding='utf-8-sig'))
+         self.ling_dict = {k: TokenTextEncoder(None, vocab_list=ling_dict[k], replace_oov='<UNK>') for k in ['phone', 'tone']}
+         self.token_encoder = token_encoder = self.ling_dict['phone']
+         ph_dict_size = len(token_encoder)
+
+         ''' Load Duration LM '''
+         from tts.modules.ar_dur.ar_dur_predictor import ARDurPredictor
+         hp_dur_model = self.hp_dur_model = set_hparams(f'{self.dur_exp_name}/config.yaml', global_hparams=False)
+         hp_dur_model['frames_multiple'] = hparams['frames_multiple']
+         self.dur_model = ARDurPredictor(
+             hp_dur_model, hp_dur_model['dur_txt_hs'], hp_dur_model['dur_model_hidden_size'],
+             hp_dur_model['dur_model_layers'], ph_dict_size,
+             hp_dur_model['dur_code_size'],
+             use_rot_embed=hp_dur_model.get('use_rot_embed', False))
+         self.length_regulator = LengthRegulator()
+         load_ckpt(self.dur_model, f'{self.dur_exp_name}', 'dur_model')
+         self.dur_model.eval()
+         self.dur_model.to(device)
+
+         ''' Load Diffusion Transformer '''
+         from tts.modules.llm_dit.dit import Diffusion
+         self.dit = Diffusion()
+         load_ckpt(self.dit, f'{self.dit_exp_name}', 'dit', strict=False)
+         self.dit.eval()
+         self.dit.to(device)
+         self.cfg_mask_token_phone = 302 - 1
+         self.cfg_mask_token_tone = 32 - 1
+
+         ''' Load Frontend LM '''
+         from tts.modules.aligner.whisper_small import Whisper
+         self.aligner_lm = Whisper()
+         load_ckpt(self.aligner_lm, f'{self.frontend_exp_name}', 'model')
+         self.aligner_lm.eval()
+         self.aligner_lm.to(device)
+         self.kv_cache = None
+         self.hooks = None
+
+         ''' Load G2P LM'''
+         from transformers import AutoTokenizer, AutoModelForCausalLM
+         g2p_tokenizer = AutoTokenizer.from_pretrained(self.g2p_exp_name, padding_side="right")
+         g2p_tokenizer.padding_side = "right"
+         self.g2p_model = AutoModelForCausalLM.from_pretrained(self.g2p_exp_name).eval().to(device)
+         self.g2p_tokenizer = g2p_tokenizer
+         self.speech_start_idx = g2p_tokenizer.encode('<Reserved_TTS_0>')[0]
+
+         ''' Wav VAE '''
+         self.hp_wavvae = hp_wavvae = set_hparams(f'{self.wavvae_exp_name}/config.yaml', global_hparams=False)
+         from tts.modules.wavvae.decoder.wavvae_v3 import WavVAE_V3
+         self.wavvae = WavVAE_V3(hparams=hp_wavvae)
+         if os.path.exists(f'{self.wavvae_exp_name}/model_only_last.ckpt'):
+             load_ckpt(self.wavvae, f'{self.wavvae_exp_name}/model_only_last.ckpt', 'model_gen', strict=True)
+             self.has_vae_encoder = True
+         else:
+             load_ckpt(self.wavvae, f'{self.wavvae_exp_name}/decoder.ckpt', 'model_gen', strict=False)
+             self.has_vae_encoder = False
+         self.wavvae.eval()
+         self.wavvae.to(device)
+         self.vae_stride = hp_wavvae.get('vae_stride', 4)
+         self.hop_size = hp_wavvae.get('hop_size', 4)
+
+     def preprocess(self, audio_bytes, latent_file=None, topk_dur=1, **kwargs):
+         wav_bytes = convert_to_wav_bytes(audio_bytes)
+
+         ''' Load wav '''
+         wav, _ = librosa.core.load(wav_bytes, sr=self.sr)
+         # Pad wav if necessary
+         ws = hparams['win_size']
+         if len(wav) % ws < ws - 1:
+             wav = np.pad(wav, (0, ws - 1 - (len(wav) % ws)), mode='constant', constant_values=0.0).astype(np.float32)
+         wav = np.pad(wav, (0, 12000), mode='constant', constant_values=0.0).astype(np.float32)
+         self.loudness_prompt = self.loudness_meter.integrated_loudness(wav.astype(float))
+
+         ''' obtain alignments with aligner_lm '''
+         ph_ref, tone_ref, mel2ph_ref = align(self, wav)
+
+         with torch.inference_mode():
+             ''' Forward WaveVAE to obtain: prompt latent '''
+             if self.has_vae_encoder:
+                 wav = torch.FloatTensor(wav)[None].to(self.device)
+                 vae_latent = self.wavvae.encode_latent(wav)
+                 vae_latent = vae_latent[:, :mel2ph_ref.size(1)//4]
+             else:
+                 assert latent_file is not None, "Please provide latent_file in WaveVAE decoder-only mode"
+                 vae_latent = torch.from_numpy(np.load(latent_file)).to(self.device)
+                 vae_latent = vae_latent[:, :mel2ph_ref.size(1)//4]
+
+             ''' Duration Prompting '''
+             self.dur_model.hparams["infer_top_k"] = topk_dur if topk_dur > 1 else None
+             incremental_state_dur_prompt, ctx_dur_tokens = make_dur_prompt(self, mel2ph_ref, ph_ref, tone_ref)
+
+         return {
+             'ph_ref': ph_ref,
+             'tone_ref': tone_ref,
+             'mel2ph_ref': mel2ph_ref,
+             'vae_latent': vae_latent,
+             'incremental_state_dur_prompt': incremental_state_dur_prompt,
+             'ctx_dur_tokens': ctx_dur_tokens,
+         }
+
+     def forward(self, resource_context, input_text, time_step, p_w, t_w, dur_disturb=0.1, dur_alpha=1.0, **kwargs):
+         device = self.device
+
+         ph_ref = resource_context['ph_ref'].to(device)
+         tone_ref = resource_context['tone_ref'].to(device)
+         mel2ph_ref = resource_context['mel2ph_ref'].to(device)
+         vae_latent = resource_context['vae_latent'].to(device)
+         ctx_dur_tokens = resource_context['ctx_dur_tokens'].to(device)
+         incremental_state_dur_prompt = resource_context['incremental_state_dur_prompt']
+
+         with torch.inference_mode():
+             ''' Generating '''
+             wav_pred_ = []
+             language_type = classify_language(input_text)
+             if language_type == 'en':
+                 input_text = self.en_normalizer.normalize(input_text)
+                 text_segs = chunk_text_english(input_text, max_chars=130)
+             else:
+                 input_text = self.zh_normalizer.normalize(input_text)
+                 text_segs = chunk_text_chinese(input_text, limit=60)
+
+             for seg_i, text in enumerate(text_segs):
+                 ''' G2P '''
+                 ph_pred, tone_pred = g2p(self, text)
+
+                 ''' Duration Prediction '''
+                 mel2ph_pred = dur_pred(self, ctx_dur_tokens, incremental_state_dur_prompt, ph_pred, tone_pred, seg_i, dur_disturb, dur_alpha, is_first=seg_i==0, is_final=seg_i==len(text_segs)-1)
+
+                 inputs = prepare_inputs_for_dit(self, mel2ph_ref, mel2ph_pred, ph_ref, tone_ref, ph_pred, tone_pred, vae_latent)
+                 # Speech dit inference
+                 with torch.cuda.amp.autocast(dtype=self.precision, enabled=True):
+                     x = self.dit.inference(inputs, timesteps=time_step, seq_cfg_w=[p_w, t_w]).float()
+
+                 # WavVAE decode
+                 x[:, :vae_latent.size(1)] = vae_latent
+                 wav_pred = self.wavvae.decode(x)[0,0].to(torch.float32)
+
+                 ''' Post-processing '''
+                 # Trim prompt wav
+                 wav_pred = wav_pred[vae_latent.size(1)*self.vae_stride*self.hop_size:].cpu().numpy()
+                 # Norm generated wav to prompt wav's level
+                 meter = pyln.Meter(self.sr)  # create BS.1770 meter
+                 loudness_pred = self.loudness_meter.integrated_loudness(wav_pred.astype(float))
+                 wav_pred = pyln.normalize.loudness(wav_pred, loudness_pred, self.loudness_prompt)
+                 if np.abs(wav_pred).max() >= 1:
+                     wav_pred = wav_pred / np.abs(wav_pred).max() * 0.95
+
+                 # Apply hamming window
+                 wav_pred_.append(wav_pred)
+
+             return combine_audio_segments(wav_pred_, sr=self.sr).astype(float)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--input_wav', type=str)
+     parser.add_argument('--input_text', type=str)
+     parser.add_argument('--output_dir', type=str)
+     parser.add_argument('--time_step', type=int, default=32, help='Inference steps of Diffusion Transformer')
+     parser.add_argument('--p_w', type=float, default=1.6, help='Intelligibility Weight')
+     parser.add_argument('--t_w', type=float, default=2.5, help='Similarity Weight')
+     args = parser.parse_args()
+     wav_path, input_text, out_path, time_step, p_w, t_w = args.input_wav, args.input_text, args.output_dir, args.time_step, args.p_w, args.t_w
+
+     infer_ins = MegaTTS3DiTInfer()
+
+     with open(wav_path, 'rb') as file:
+         file_content = file.read()
+
+     print(f"| Start processing {wav_path}+{input_text}")
+     resource_context = infer_ins.preprocess(file_content, latent_file=wav_path.replace('.wav', '.npy'))
+     wav_bytes = infer_ins.forward(resource_context, input_text, time_step=time_step, p_w=p_w, t_w=t_w)
+
+     print(f"| Saving results to {out_path}/[P]{input_text[:20]}.wav")
+     os.makedirs(out_path, exist_ok=True)
+     save_wav(wav_bytes, f'{out_path}/[P]{input_text[:20]}.wav')
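
The new infer_cli.py above is both a command-line entry point and an importable class. For orientation, the sketch below simply mirrors the script's own __main__ block; the prompt paths are placeholders, and inside the wheel the module is vendored under xinference.thirdparty.megatts3.tts.infer_cli.

# Minimal sketch of driving MegaTTS3DiTInfer programmatically, mirroring the __main__ block above.
# Placeholder paths; checkpoints are expected under ./checkpoints by default.
from tts.infer_cli import MegaTTS3DiTInfer
from tts.utils.audio_utils.io import save_wav

infer_ins = MegaTTS3DiTInfer()            # loads duration LM, DiT, aligner, G2P LM and WavVAE

with open("prompt.wav", "rb") as f:       # reference speaker audio (placeholder)
    audio_bytes = f.read()

# In WavVAE decoder-only mode a precomputed latent (.npy) must accompany the prompt wav.
ctx = infer_ins.preprocess(audio_bytes, latent_file="prompt.npy")
wav = infer_ins.forward(ctx, "Text to synthesize.", time_step=32, p_w=1.6, t_w=2.5)
save_wav(wav, "output.wav")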

xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py
@@ -0,0 +1,318 @@
+ # MIT License
+
+ # Copyright (c) 2022 OpenAI
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # Copyright (c) [2022] [OpenAI]
+ # Copyright (c) [2025] [Ziyue Jiang]
+ # SPDX-License-Identifier: MIT
+ # This file has been modified by Ziyue Jiang on 2025/03/19
+ # Original file was released under MIT, with the full license text # available at https://github.com/openai/whisper/blob/v20240930/LICENSE.
+ # This modified file is released under the same license.
+
+ from contextlib import contextmanager
+ from typing import Dict, Iterable, Optional, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+
+ from torch.nn.functional import scaled_dot_product_attention
+ SDPA_AVAILABLE = True
+
+
+ class LayerNorm(nn.LayerNorm):
+     def forward(self, x: Tensor) -> Tensor:
+         return super().forward(x.float()).type(x.dtype)
+
+
+ class Linear(nn.Linear):
+     def forward(self, x: Tensor) -> Tensor:
+         return F.linear(
+             x,
+             self.weight.to(x.dtype),
+             None if self.bias is None else self.bias.to(x.dtype),
+         )
+
+
+ class Conv1d(nn.Conv1d):
+     def _conv_forward(
+         self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+     ) -> Tensor:
+         return super()._conv_forward(
+             x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+         )
+
+
+ def sinusoids(length, channels, max_timescale=10000):
+     """Returns sinusoids for positional embedding"""
+     assert channels % 2 == 0
+     log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+     inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+ @contextmanager
+ def disable_sdpa():
+     prev_state = MultiHeadAttention.use_sdpa
+     try:
+         MultiHeadAttention.use_sdpa = False
+         yield
+     finally:
+         MultiHeadAttention.use_sdpa = prev_state
+
+
+ class MultiHeadAttention(nn.Module):
+     use_sdpa = True
+
+     def __init__(self, n_state: int, n_head: int):
+         super().__init__()
+         self.n_head = n_head
+         self.query = Linear(n_state, n_state)
+         self.key = Linear(n_state, n_state, bias=False)
+         self.value = Linear(n_state, n_state)
+         self.out = Linear(n_state, n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None
+     ):
+         q = self.query(x)
+
+         if kv_cache is None or xa is None or self.key not in kv_cache:
+             # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+             # otherwise, perform key/value projections for self- or cross-attention as usual.
+             k = self.key(x if xa is None else xa)
+             v = self.value(x if xa is None else xa)
+         else:
+             # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+             k = kv_cache[self.key]
+             v = kv_cache[self.value]
+
+         wv = self.qkv_attention(q, k, v, mask, casual)
+         return self.out(wv)
+
+     def qkv_attention(
+         self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None, casual: Optional[bool] = None
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         n_batch, n_ctx, n_state = q.shape
+         scale = (n_state // self.n_head) ** -0.25
+         q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+         a = scaled_dot_product_attention(
+             q, k, v, is_causal=casual and n_ctx > 1, attn_mask=mask[:, None, None, :] if mask is not None else None
+         )
+         out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
+         return out
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+         super().__init__()
+
+         self.attn = MultiHeadAttention(n_state, n_head)
+         self.attn_ln = LayerNorm(n_state)
+
+         self.cross_attn = (
+             MultiHeadAttention(n_state, n_head) if cross_attention else None
+         )
+         self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+
+         n_mlp = n_state * 4
+         self.mlp = nn.Sequential(
+             Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+         )
+         self.mlp_ln = LayerNorm(n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None,
+     ):
+         x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache, casual=casual)
+         if self.cross_attn:
+             # TODO: Cross attention mask
+             x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache, casual=False)
+         x = x + self.mlp(self.mlp_ln(x))
+         return x
+
+
+ class AudioEncoder(nn.Module):
+     def __init__(
+         self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+         self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+         self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+         self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+         )
+         self.ln_post = LayerNorm(n_state)
+
+     def forward(self, x: Tensor, attn_mask: Tensor):
+         """
+         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+             the mel spectrogram of the audio
+         """
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = x.permute(0, 2, 1)
+
+         # assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
+         x = (x + self.positional_embedding[:x.size(1)]).to(x.dtype)
+
+         for block in self.blocks:
+             x = block(x, mask=attn_mask, casual=False)
+
+         x = self.ln_post(x)
+         return x
+
+
+ class TextDecoder(nn.Module):
+     def __init__(
+         self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+
+         self.token_embedding = nn.Embedding(n_vocab, n_state)
+         self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [
+                 ResidualAttentionBlock(n_state, n_head, cross_attention=True)
+                 for _ in range(n_layer)
+             ]
+         )
+         self.ln = LayerNorm(n_state)
+
+         self.out_proj = nn.Linear(n_state, n_vocab)
+
+     def forward(self, x: Tensor, attn_mask: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+         """
+         x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+             the text tokens
+         xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
+             the encoded audio features to be attended on
+         """
+         offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+         x = (
+             self.token_embedding(x)
+             + self.positional_embedding[offset : offset + x.shape[-1]]
+         )
+         x = x.to(xa.dtype)
+
+         for block in self.blocks:
+             x = block(x, xa, mask=attn_mask, kv_cache=kv_cache, casual=True)
+
+         x = self.ln(x)
+         # logits = (
+         #     x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
+         # ).float()
+         logits = self.out_proj(x)
+
+         return logits
+
+
+ class Whisper(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.n_vocab = 6800
+         self.n_text_layer = 6
+         self.n_text_head = 8
+         self.n_text_ctx = 2048
+
+         self.encoder = AudioEncoder(
+             n_mels=80, n_ctx=3000, n_state=512, n_head=8, n_layer=6,
+         )
+         self.decoder = TextDecoder(
+             n_vocab=6800, n_ctx=2048, n_state=512, n_head=8, n_layer=6,
+         )
+
+     def embed_audio(self, mel: torch.Tensor):
+         return self.encoder(mel, None)
+
+     def logits(self, tokens, audio_features, kv_cache=None):
+         return self.decoder(tokens, None, audio_features, kv_cache=kv_cache)
+
+     def forward(
+         self, mel, mel_len, token, token_len
+     ) -> Dict[str, torch.Tensor]:
+         attn_mask_enc = self.sequence_mask(mel_len//2, device=mel.device) > 0
+         attn_mask_dec = self.sequence_mask(token_len, device=mel.device) > 0
+         return self.decoder(token, attn_mask_dec, self.encoder(mel, attn_mask_enc))
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+         """
+         The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+         tensors calculated for the previous positions. This method returns a dictionary that stores
+         all caches, and the necessary hooks for the key and value projection modules that save the
+         intermediate tensors to be reused during later calculations.
+
+         Returns
+         -------
+         cache : Dict[nn.Module, torch.Tensor]
+             A dictionary object mapping the key/value projection modules to its cache
+         hooks : List[RemovableHandle]
+             List of PyTorch RemovableHandle objects to stop the hooks to be called
+         """
+         cache = {**cache} if cache is not None else {}
+         hooks = []
+
+         def save_to_cache(module, _, output):
+             if module not in cache or output.shape[1] > self.n_text_ctx:
+                 # save as-is, for the first token or cross attention
+                 cache[module] = output
+             else:
+                 cache[module] = torch.cat([cache[module], output], dim=1).detach()
+             return cache[module]
+
+         def install_hooks(layer: nn.Module):
+             if isinstance(layer, MultiHeadAttention):
+                 hooks.append(layer.key.register_forward_hook(save_to_cache))
+                 hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+         self.decoder.apply(install_hooks)
+         return cache, hooks
+
+     def sequence_mask(self, seq_lens, max_len=None, device='cpu'):
+         b = seq_lens.shape[0]
+         if max_len is None:
+             max_len = seq_lens.max()
+         mask = torch.arange(max_len).unsqueeze(0).to(device)  # [1, t]
+         mask = mask < (seq_lens.unsqueeze(1))  # [1, t] + [b, 1] = [b, t]
+         mask = mask.float()
+         return mask
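
This module is the frontend aligner that infer_cli.py loads into MegaTTS3DiTInfer.aligner_lm via load_ckpt. For orientation, a minimal sketch of exercising the encoder/decoder and the kv-cache hooks follows; the mel and token tensors are dummy placeholders, and a real run would load trained weights first.

# Minimal sketch of the Whisper aligner's kv-cache hook usage (dummy inputs, untrained weights).
import torch
from tts.modules.aligner.whisper_small import Whisper

model = Whisper().eval()                              # in practice: load_ckpt(model, ..., 'model')
mel = torch.randn(1, 80, 3000)                        # (batch, n_mels, frames)
tokens = torch.randint(0, model.n_vocab, (1, 12))     # (batch, text tokens)

with torch.inference_mode():
    audio_features = model.embed_audio(mel)           # (1, 1500, 512) after the stride-2 conv
    cache, hooks = model.install_kv_cache_hooks()     # hooks capture per-layer k/v projections
    logits = model.logits(tokens, audio_features, kv_cache=cache)  # (1, 12, 6800)

for hook in hooks:                                    # detach the forward hooks when done
    hook.remove()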