xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
@@ -49,13 +49,14 @@ class InterpolateRegulator(nn.Module):
         olens = ylens
         return out * mask, olens
 
-    def inference(self, x1, x2, mel_len1, mel_len2):
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
         # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
         # x in (B, T, D)
         if x2.shape[1] > 40:
-            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=34, mode='linear')
-            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - 34 * 2, mode='linear')
-            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=34, mode='linear')
+            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
+            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
+                                   mode='linear')
+            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
             x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
         else:
             x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
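
Editorial note on the hunk above: the new `size` expression converts the 20-token prompt edge into mel frames, and with the default `input_frame_rate=50` it reproduces the old hard-coded 34. A minimal sketch of the arithmetic, assuming 22050 and 256 are the mel sample rate and hop length (they appear only as literals in the expression):

# 20 speech tokens at `input_frame_rate` tokens/s cover 20 / input_frame_rate seconds of audio;
# expressing that duration in mel frames at a 22050 Hz sample rate with a 256-sample hop
# gives the interpolation target size.
def prompt_edge_frames(num_tokens=20, input_frame_rate=50, sample_rate=22050, hop_length=256):
    return int(num_tokens / input_frame_rate * sample_rate / hop_length)

print(prompt_edge_frames(input_frame_rate=50))  # 34 -> the old hard-coded constant
print(prompt_edge_frames(input_frame_rate=25))  # 68 -> e.g. a 25 Hz speech tokenizer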
@@ -0,0 +1,140 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+from typing import List, Optional, Tuple
+from einops import rearrange
+from torchaudio.transforms import Spectrogram
+
+
+class MultipleDiscriminator(nn.Module):
+    def __init__(
+        self, mpd: nn.Module, mrd: nn.Module
+    ):
+        super().__init__()
+        self.mpd = mpd
+        self.mrd = mrd
+
+    def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+        this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mpd(y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1))
+        y_d_rs += this_y_d_rs
+        y_d_gs += this_y_d_gs
+        fmap_rs += this_fmap_rs
+        fmap_gs += this_fmap_gs
+        this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mrd(y, y_hat)
+        y_d_rs += this_y_d_rs
+        y_d_gs += this_y_d_gs
+        fmap_rs += this_fmap_rs
+        fmap_gs += this_fmap_gs
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class MultiResolutionDiscriminator(nn.Module):
+    def __init__(
+        self,
+        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
+        num_embeddings: Optional[int] = None,
+    ):
+        """
+        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
+        Additionally, it allows incorporating conditional information with a learned embeddings table.
+
+        Args:
+            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
+            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+                Defaults to None.
+        """
+
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
+        )
+
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+
+        for d in self.discriminators:
+            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
+            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorR(nn.Module):
+    def __init__(
+        self,
+        window_length: int,
+        num_embeddings: Optional[int] = None,
+        channels: int = 32,
+        hop_factor: float = 0.25,
+        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
+    ):
+        super().__init__()
+        self.window_length = window_length
+        self.hop_factor = hop_factor
+        self.spec_fn = Spectrogram(
+            n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
+        )
+        n_fft = window_length // 2 + 1
+        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
+        self.bands = bands
+        convs = lambda: nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
+            ]
+        )
+        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
+
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)
+
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
+
+    def spectrogram(self, x):
+        # Remove DC offset
+        x = x - x.mean(dim=-1, keepdims=True)
+        # Peak normalize the volume of input audio
+        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
+        x = self.spec_fn(x)
+        x = torch.view_as_real(x)
+        x = rearrange(x, "b f t c -> b c t f")
+        # Split into bands
+        x_bands = [x[..., b[0]: b[1]] for b in self.bands]
+        return x_bands
+
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
+        x_bands = self.spectrogram(x)
+        fmap = []
+        x = []
+        for band, stack in zip(x_bands, self.band_convs):
+            for i, layer in enumerate(stack):
+                band = layer(band)
+                band = torch.nn.functional.leaky_relu(band, 0.1)
+                if i > 0:
+                    fmap.append(band)
+            x.append(band)
+        x = torch.cat(x, dim=-1)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        x += h
+
+        return x, fmap
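
Editorial note: a minimal usage sketch for the new discriminator file above (not part of the package; batch size and audio length are arbitrary):

import torch

# Run the multi-resolution discriminator on dummy waveforms of shape (batch, samples).
# Each element of y_d_rs / y_d_gs is a logit map and each element of fmap_rs / fmap_gs
# is a list of intermediate feature maps -- one entry per FFT size (three here).
mrd = MultiResolutionDiscriminator(fft_sizes=(2048, 1024, 512))
real = torch.randn(2, 22050)
fake = torch.randn(2, 22050)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mrd(real, fake)
print(len(y_d_rs), len(fmap_rs))  # 3 3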
@@ -14,7 +14,7 @@
 
 """HIFI-GAN"""
 
-import typing as tp
+from typing import Dict, Optional, List
 import numpy as np
 from scipy.signal import get_window
 import torch
@@ -38,13 +38,15 @@ This code is modified from https://github.com/jik876/hifi-gan
 https://github.com/NVIDIA/BigVGAN
 
 """
+
+
 class ResBlock(torch.nn.Module):
     """Residual block module in HiFiGAN/BigVGAN."""
     def __init__(
             self,
             channels: int = 512,
             kernel_size: int = 3,
-            dilations: tp.List[int] = [1, 3, 5],
+            dilations: List[int] = [1, 3, 5],
     ):
         super(ResBlock, self).__init__()
         self.convs1 = nn.ModuleList()
@@ -100,6 +102,7 @@ class ResBlock(torch.nn.Module):
             remove_weight_norm(self.convs1[idx])
             remove_weight_norm(self.convs2[idx])
 
+
 class SineGen(torch.nn.Module):
     """ Definition of sine generator
     SineGen(samp_rate, harmonic_num = 0,
@@ -231,13 +234,13 @@ class HiFTGenerator(nn.Module):
             nsf_alpha: float = 0.1,
             nsf_sigma: float = 0.003,
             nsf_voiced_threshold: float = 10,
-            upsample_rates: tp.List[int] = [8, 8],
-            upsample_kernel_sizes: tp.List[int] = [16, 16],
-            istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
-            resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
-            resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-            source_resblock_kernel_sizes: tp.List[int] = [7, 11],
-            source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
+            upsample_rates: List[int] = [8, 8],
+            upsample_kernel_sizes: List[int] = [16, 16],
+            istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
+            resblock_kernel_sizes: List[int] = [3, 7, 11],
+            resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            source_resblock_kernel_sizes: List[int] = [7, 11],
+            source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
             lrelu_slope: float = 0.1,
             audio_limit: float = 0.99,
             f0_predictor: torch.nn.Module = None,
@@ -286,8 +289,7 @@ class HiFTGenerator(nn.Module):
         self.source_resblocks = nn.ModuleList()
         downsample_rates = [1] + upsample_rates[::-1][:-1]
         downsample_cum_rates = np.cumprod(downsample_rates)
-        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes,
-                                          source_resblock_dilation_sizes)):
+        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
             if u == 1:
                 self.source_downs.append(
                     Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
@@ -304,7 +306,7 @@ class HiFTGenerator(nn.Module):
         self.resblocks = nn.ModuleList()
         for i in range(len(self.ups)):
             ch = base_channels // (2**(i + 1))
-            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(ResBlock(ch, k, d))
 
         self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
@@ -314,11 +316,19 @@ class HiFTGenerator(nn.Module):
         self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
         self.f0_predictor = f0_predictor
 
-    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
-
-        har_source, _, _ = self.m_source(f0)
-        return har_source.transpose(1, 2)
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+        self.m_source.remove_weight_norm()
+        for l in self.source_downs:
+            remove_weight_norm(l)
+        for l in self.source_resblocks:
+            l.remove_weight_norm()
 
     def _stft(self, x):
         spec = torch.stft(
@@ -332,17 +342,11 @@ class HiFTGenerator(nn.Module):
         magnitude = torch.clip(magnitude, max=1e2)
         real = magnitude * torch.cos(phase)
         img = magnitude * torch.sin(phase)
-        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
+        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
+                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
         return inverse_transform
 
-    def forward(self, x: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
-        f0 = self.f0_predictor(x)
-        s = self._f02source(f0)
-
-        # use cache_source to avoid glitch
-        if cache_source.shape[2] == 0:
-            s[:, :, :cache_source.shape[2]] = cache_source
-
+    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
         s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
         s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
 
@@ -374,22 +378,34 @@ class HiFTGenerator(nn.Module):
 
         x = self._istft(magnitude, phase)
         x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x, s
+        return x
 
-    def remove_weight_norm(self):
-        print('Removing weight norm...')
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-        self.source_module.remove_weight_norm()
-        for l in self.source_downs:
-            remove_weight_norm(l)
-        for l in self.source_resblocks:
-            l.remove_weight_norm()
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # mel+source->speech
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, f0
 
     @torch.inference_mode()
-    def inference(self, mel: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
-        return self.forward(x=mel, cache_source=cache_source)
+    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # use cache_source to avoid glitch
+        if cache_source.shape[2] != 0:
+            s[:, :, :cache_source.shape[2]] = cache_source
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, s
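
Editorial note: the refactor above splits the old forward pass into a shared `decode()` plus an `inference()` that rebuilds the excitation source from the mel chunk and lets the caller overwrite its first frames with a cached source, which is what keeps chunked (streaming) vocoding glitch-free. A rough sketch of the calling pattern, where `hift` and `mel` are placeholders rather than package code:

import torch

# `hift` is assumed to be a constructed HiFTGenerator, `mel` a (1, num_mels, T) mel chunk.
cache_source = torch.zeros(1, 1, 0)  # empty cache for the very first chunk
audio, source = hift.inference(speech_feat=mel, cache_source=cache_source)
# For the next chunk, the caller passes the source frames of the overlapping region back
# in as `cache_source`; inference() copies them over the freshly generated source so the
# excitation stays continuous across the chunk boundary.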
@@ -0,0 +1,67 @@
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
+from cosyvoice.utils.losses import tpr_loss, mel_loss
+
+
+class HiFiGan(nn.Module):
+    def __init__(self, generator, discriminator, mel_spec_transform,
+                 multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
+                 tpr_loss_weight=1.0, tpr_loss_tau=0.04):
+        super(HiFiGan, self).__init__()
+        self.generator = generator
+        self.discriminator = discriminator
+        self.mel_spec_transform = mel_spec_transform
+        self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
+        self.feat_match_loss_weight = feat_match_loss_weight
+        self.tpr_loss_weight = tpr_loss_weight
+        self.tpr_loss_tau = tpr_loss_tau
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        if batch['turn'] == 'generator':
+            return self.forward_generator(batch, device)
+        else:
+            return self.forward_discriminator(batch, device)
+
+    def forward_generator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        pitch_feat = batch['pitch_feat'].to(device)
+        # 1. calculate generator outputs
+        generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
+        loss_gen, _ = generator_loss(y_d_gs)
+        loss_fm = feature_loss(fmap_rs, fmap_gs)
+        loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss_f0 = F.l1_loss(generated_f0, pitch_feat)
+        loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
+            self.multi_mel_spectral_recon_loss_weight * loss_mel + \
+            self.tpr_loss_weight * loss_tpr + loss_f0
+        return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
+
+    def forward_discriminator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        # 1. calculate generator outputs
+        with torch.no_grad():
+            generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate discriminator losses, tpr losses [Optional]
+        loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss = loss_disc + self.tpr_loss_weight * loss_tpr
+        return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
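
Editorial note: the `batch['turn']` switch above is what drives alternating GAN updates during training. A rough sketch of the loop a trainer would run (the dataloader, optimizers, and `hifigan` instance are assumptions, not code from this package):

for batch in dataloader:
    # Discriminator step: forward_discriminator() runs the generator under no_grad.
    batch['turn'] = 'discriminator'
    d_out = hifigan(batch, device)
    optim_d.zero_grad()
    d_out['loss'].backward()
    optim_d.step()

    # Generator step: adversarial + feature-matching + mel + optional TPR + F0 losses.
    batch['turn'] = 'generator'
    g_out = hifigan(batch, device)
    optim_g.zero_grad()
    g_out['loss'].backward()
    optim_g.step()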
@@ -15,6 +15,7 @@ from typing import Dict, Optional, Callable, List, Generator
 import torch
 from torch import nn
 import torch.nn.functional as F
+from transformers import Qwen2ForCausalLM
 from torch.nn.utils.rnn import pad_sequence, unpad_sequence
 from cosyvoice.utils.common import IGNORE_ID
 from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
@@ -80,7 +81,8 @@ class TransformerLM(torch.nn.Module):
     def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len):
         text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
         speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
-        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0) for i in range(len(text_token))]
+        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0)
+                    for i in range(len(text_token))]
         lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
         lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
         return lm_input, lm_input_len
@@ -104,7 +106,8 @@ class TransformerLM(torch.nn.Module):
         embedding = batch['embedding'].to(device)
 
         # 1. prepare llm_target
-        lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
+        lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() +
+                                  [self.speech_token_size]) for i in range(text_token.size(0))]
         lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
 
         # 1. encode text_token
@@ -124,7 +127,8 @@ class TransformerLM(torch.nn.Module):
         speech_token = self.speech_embedding(speech_token)
 
         # 5. unpad and pad
-        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len)
+        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len,
+                                                         task_id_emb, speech_token, speech_token_len)
 
         # 6. run lm forward
         lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
@@ -194,14 +198,143 @@ class TransformerLM(torch.nn.Module):
         offset = 0
         att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
         for i in range(max_len):
-            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache,
-                                                                  att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool))
+            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=offset, required_cache_size=-1,
+                                                                  att_cache=att_cache, cnn_cache=cnn_cache,
+                                                                  att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
+                                                                                                 device=lm_input.device)).to(torch.bool))
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+            # force continue decode first token
+            if i == 0:
+                logp[:, self.speech_token_size] = -float('inf')
             top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
             if top_ids == self.speech_token_size:
                 break
             # in stream mode, yield token one by one
-            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
+            yield top_ids
             out_tokens.append(top_ids)
             offset += lm_input.size(1)
             lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+
+
+class Qwen2Encoder(torch.nn.Module):
+    def __init__(self, pretrain_path):
+        super().__init__()
+        self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)
+
+    def forward_one_step(self, xs, masks, cache=None):
+        input_masks = masks[:, -1, :]
+        outs = self.model(
+            inputs_embeds=xs,
+            attention_mask=input_masks,
+            output_hidden_states=True,
+            return_dict=True,
+            use_cache=True,
+            past_key_values=cache,
+        )
+        xs = outs.hidden_states[-1]
+        new_cache = outs.past_key_values
+        return xs, new_cache
+
+
+class Qwen2LM(torch.nn.Module):
+    def __init__(
+            self,
+            llm_input_size: int,
+            llm_output_size: int,
+            speech_token_size: int,
+            llm: torch.nn.Module,
+            sampling: Callable,
+            length_normalized_loss: bool = True,
+            lsm_weight: float = 0.0,
+    ):
+        super().__init__()
+        self.llm_input_size = llm_input_size
+        self.llm_output_size = llm_output_size
+        self.speech_token_size = speech_token_size
+
+        # 2. build speech token language model related modules
+        self.sos_eos = 0
+        self.task_id = 1
+        self.fill_token = 2
+
+        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3)
+        self.criterion_ce = LabelSmoothingLoss(
+            size=speech_token_size + 3,
+            padding_idx=IGNORE_ID,
+            smoothing=lsm_weight,
+            normalize_length=length_normalized_loss,
+        )
+
+        # 3. [Optional] build speech token related modules
+        self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)
+
+        # 4. sampling method
+        self.sampling = sampling
+
+    def sampling_ids(
+            self,
+            weighted_scores: torch.Tensor,
+            decoded_tokens: List,
+            sampling: int,
+            ignore_eos: bool = True,
+    ):
+        while True:
+            top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
+            if (not ignore_eos) or (self.speech_token_size not in top_ids):
+                break
+        return top_ids
+
+    @torch.inference_mode()
+    def inference(
+            self,
+            text: torch.Tensor,
+            text_len: torch.Tensor,
+            prompt_text: torch.Tensor,
+            prompt_text_len: torch.Tensor,
+            prompt_speech_token: torch.Tensor,
+            prompt_speech_token_len: torch.Tensor,
+            embedding: torch.Tensor,
+            sampling: int = 25,
+            max_token_text_ratio: float = 20,
+            min_token_text_ratio: float = 2,
+    ) -> Generator[torch.Tensor, None, None]:
+        device = text.device
+        text = torch.concat([prompt_text, text], dim=1)
+        text_len += prompt_text_len
+        text = self.llm.model.model.embed_tokens(text)
+
+        # 2. encode embedding
+        embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+
+        # 3. concat llm_input
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        if prompt_speech_token_len != 0:
+            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+        else:
+            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+        lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
+
+        # 4. cal min/max_length
+        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
+        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
+
+        # 5. step by step decode
+        out_tokens = []
+        cache = None
+        for i in range(max_len):
+            y_pred, cache = self.llm.forward_one_step(lm_input,
+                                                      masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
+                                                      cache=cache)
+            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            if top_ids == self.speech_token_size:
+                break
+            if top_ids > self.speech_token_size:
+                continue
+            # in stream mode, yield token one by one
+            yield top_ids
+            out_tokens.append(top_ids)
+            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
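
Editorial note: both `TransformerLM.inference` and the new `Qwen2LM.inference` are generators that now yield plain integer speech-token ids one at a time, so downstream stages can start before decoding finishes. A sketch of how a caller consumes the stream (`llm` and the input tensors are placeholders, not package code):

speech_tokens = []
for token_id in llm.inference(text=text, text_len=text_len,
                              prompt_text=prompt_text, prompt_text_len=prompt_text_len,
                              prompt_speech_token=prompt_speech_token,
                              prompt_speech_token_len=prompt_speech_token_len,
                              embedding=embedding):
    # Each id can be forwarded to the flow-matching / vocoder stage immediately.
    speech_tokens.append(token_id)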