xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import random
  from typing import Dict, Optional, Callable, List, Generator
  import torch
  from torch import nn
@@ -20,6 +21,8 @@ from torch.nn.utils.rnn import pad_sequence, unpad_sequence
  from cosyvoice.utils.common import IGNORE_ID
  from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
  from cosyvoice.utils.common import th_accuracy
+ from cosyvoice.utils.file_utils import logging
+ from cosyvoice.utils.mask import make_pad_mask


  class TransformerLM(torch.nn.Module):
@@ -144,10 +147,14 @@ class TransformerLM(torch.nn.Module):
  sampling: int,
  ignore_eos: bool = True,
  ):
+ num_trials, max_trials = 0, 100
  while True:
  top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
  if (not ignore_eos) or (self.speech_token_size not in top_ids):
  break
+ num_trials += 1
+ if num_trials > max_trials:
+ raise RuntimeError('sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!'.format(max_trials))
  return top_ids

  @torch.inference_mode()
@@ -178,7 +185,7 @@ class TransformerLM(torch.nn.Module):
  embedding = self.spk_embed_affine_layer(embedding)
  embedding = embedding.unsqueeze(dim=1)
  else:
- embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+ embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device).to(text.dtype)

  # 3. concat llm_input
  sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
@@ -221,6 +228,17 @@ class Qwen2Encoder(torch.nn.Module):
  super().__init__()
  self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)

+ def forward(self, xs: torch.Tensor, xs_lens: torch.Tensor):
+ T = xs.size(1)
+ masks = ~make_pad_mask(xs_lens, T)
+ outs = self.model(
+ inputs_embeds=xs,
+ attention_mask=masks,
+ output_hidden_states=True,
+ return_dict=True,
+ )
+ return outs.hidden_states[-1], masks.unsqueeze(1)
+
  def forward_one_step(self, xs, masks, cache=None):
  input_masks = masks[:, -1, :]
  outs = self.model(
@@ -236,7 +254,7 @@ class Qwen2Encoder(torch.nn.Module):
  return xs, new_cache


- class Qwen2LM(torch.nn.Module):
+ class Qwen2LM(TransformerLM):
  def __init__(
  self,
  llm_input_size: int,
@@ -246,8 +264,9 @@ class Qwen2LM(torch.nn.Module):
  sampling: Callable,
  length_normalized_loss: bool = True,
  lsm_weight: float = 0.0,
+ mix_ratio: List[int] = [5, 15],
  ):
- super().__init__()
+ torch.nn.Module.__init__(self)
  self.llm_input_size = llm_input_size
  self.llm_output_size = llm_output_size
  self.speech_token_size = speech_token_size
@@ -272,19 +291,83 @@ class Qwen2LM(torch.nn.Module):

  # 4. sampling method
  self.sampling = sampling
+ self.mix_ratio = mix_ratio

- def sampling_ids(
+ def prepare_lm_input_target(self, text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len):
+ lm_target, lm_input = [], []
+ text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
+ speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
+ text_token_emb = unpad_sequence(text_token_emb, text_token_len.cpu(), batch_first=True)
+ speech_token_emb = unpad_sequence(speech_token_emb, speech_token_len.cpu(), batch_first=True)
+ for i in range(len(text_token)):
+ # bistream sequence
+ if random.random() < 0.5 and speech_token_len[i] / text_token_len[i] > self.mix_ratio[1] / self.mix_ratio[0]:
+ this_lm_target, this_lm_input = [], []
+ this_lm_target.append(IGNORE_ID)
+ this_lm_input.append(self.llm_embedding.weight[self.sos_eos].reshape(1, -1))
+ for j in range(((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()):
+ this_text_token = text_token[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]].tolist()
+ this_speech_token = speech_token[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]].tolist()
+ if len(this_text_token) == self.mix_ratio[0]:
+ assert len(this_speech_token) == self.mix_ratio[1]
+ this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
+ this_lm_target += this_speech_token
+ this_lm_target.append(self.speech_token_size + 2)
+ this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]])
+ this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]])
+ else:
+ this_lm_target += [-1] * len(this_text_token)
+ this_lm_target += speech_token[i][j * self.mix_ratio[1]:].tolist()
+ this_lm_target.append(self.speech_token_size)
+ this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]:])
+ this_lm_input.append(self.llm_embedding.weight[self.task_id].reshape(1, -1))
+ this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]:])
+ this_lm_target, this_lm_input = torch.tensor(this_lm_target), torch.concat(this_lm_input, dim=0)
+ # unistream sequence
+ else:
+ this_lm_target = torch.tensor([IGNORE_ID] * (1 + text_token_len[i]) + speech_token[i].tolist() + [self.speech_token_size])
+ this_lm_input = torch.concat([self.llm_embedding.weight[self.sos_eos].reshape(1, -1), text_token_emb[i],
+ self.llm_embedding.weight[self.task_id].reshape(1, -1), speech_token_emb[i]], dim=0)
+ lm_target.append(this_lm_target)
+ lm_input.append(this_lm_input)
+ lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
+ lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
+ lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID)
+ return lm_target, lm_input, lm_input_len
+
+ def forward(
  self,
- weighted_scores: torch.Tensor,
- decoded_tokens: List,
- sampling: int,
- ignore_eos: bool = True,
- ):
- while True:
- top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
- if (not ignore_eos) or (self.speech_token_size not in top_ids):
- break
- return top_ids
+ batch: dict,
+ device: torch.device,
+ ) -> Dict[str, Optional[torch.Tensor]]:
+ """
+ Args:
+ text: (B, L, D)
+ text_lengths: (B,)
+ audio: (B, T, N) or (B, T)
+ audio_lengths: (B,)
+ """
+ text_token = batch['text_token'].to(device)
+ text_token_len = batch['text_token_len'].to(device)
+ speech_token = batch['speech_token'].to(device)
+ speech_token_len = batch['speech_token_len'].to(device)
+
+ # 1. encode text_token
+ text_token_emb = self.llm.model.model.embed_tokens(text_token)
+
+ # 2. encode speech_token
+ speech_token_emb = self.speech_embedding(speech_token)
+
+ # 3. prepare llm_input/target
+ lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len)
+ lm_target = lm_target.to(device)
+
+ # 4. run lm forward
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
+ logits = self.llm_decoder(lm_output)
+ loss = self.criterion_ce(logits, lm_target.to(device))
+ acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
+ return {'loss': loss, 'acc': acc}

  @torch.inference_mode()
  def inference(
@@ -305,9 +388,6 @@ class Qwen2LM(torch.nn.Module):
  text_len += prompt_text_len
  text = self.llm.model.model.embed_tokens(text)

- # 2. encode embedding
- embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
-
  # 3. concat llm_input
  sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
  task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
@@ -315,7 +395,7 @@ class Qwen2LM(torch.nn.Module):
  prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
  else:
  prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
- lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
+ lm_input = torch.concat([sos_eos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1)

  # 4. cal min/max_length
  min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
@@ -338,3 +418,103 @@ class Qwen2LM(torch.nn.Module):
  yield top_ids
  out_tokens.append(top_ids)
  lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+
+ @torch.inference_mode()
+ def inference_bistream(
+ self,
+ text: Generator,
+ prompt_text: torch.Tensor,
+ prompt_text_len: torch.Tensor,
+ prompt_speech_token: torch.Tensor,
+ prompt_speech_token_len: torch.Tensor,
+ embedding: torch.Tensor,
+ sampling: int = 25,
+ max_token_text_ratio: float = 20,
+ min_token_text_ratio: float = 2,
+ ) -> Generator[torch.Tensor, None, None]:
+
+ device = prompt_text.device
+ # 1. prepare input
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+ if prompt_speech_token_len != 0:
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+ else:
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device)
+ lm_input = torch.concat([sos_eos_emb], dim=1)
+
+ # 2. iterate text
+ out_tokens = []
+ cache = None
+ # NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
+ text_cache = self.llm.model.model.embed_tokens(prompt_text)
+ next_fill_index = -1
+ for this_text in text:
+ text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
+ # prompt_speech_token_emb not empty, try append to lm_input
+ while prompt_speech_token_emb.size(1) != 0:
+ if text_cache.size(1) >= self.mix_ratio[0]:
+ lm_input_text, lm_input_speech = text_cache[:, :self.mix_ratio[0]], prompt_speech_token_emb[:, :self.mix_ratio[1]]
+ logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1)))
+ lm_input = torch.concat([lm_input, lm_input_text, lm_input_speech], dim=1)
+ text_cache, prompt_speech_token_emb = text_cache[:, self.mix_ratio[0]:], prompt_speech_token_emb[:, self.mix_ratio[1]:]
+ else:
+ logging.info('not enough text token to decode, wait for more')
+ break
+ # no prompt_speech_token_emb remain, can decode some speech token
+ if prompt_speech_token_emb.size(1) == 0:
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == 1):
+ logging.info('get fill token, need to append more text token')
+ if text_cache.size(1) >= self.mix_ratio[0]:
+ lm_input_text = text_cache[:, :self.mix_ratio[0]]
+ logging.info('append {} text token'.format(lm_input_text.size(1)))
+ if len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
+ lm_input = lm_input_text
+ else:
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
+ text_cache = text_cache[:, self.mix_ratio[0]:]
+ else:
+ logging.info('not enough text token to decode, wait for more')
+ continue
+ while True:
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
+ y_pred, cache = self.llm.forward_one_step(lm_input,
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
+ cache=cache)
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
+ top_ids = self.speech_token_size + 2
+ next_fill_index += (self.mix_ratio[1] + 1)
+ else:
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
+ if top_ids == self.speech_token_size + 2:
+ next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
+ logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
+ out_tokens.append(top_ids)
+ if top_ids >= self.speech_token_size:
+ if top_ids == self.speech_token_size + 2:
+ break
+ else:
+ raise ValueError('should not get token {}'.format(top_ids))
+ yield top_ids
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+
+ # 3. final decode
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
+ logging.info('no more text token, decode until met eos')
+ while True:
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
+ y_pred, cache = self.llm.forward_one_step(lm_input,
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
+ cache=cache)
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False).item()
+ out_tokens.append(top_ids)
+ if top_ids >= self.speech_token_size:
+ if top_ids == self.speech_token_size:
+ break
+ else:
+ raise ValueError('should not get token {}'.format(top_ids))
+ # in stream mode, yield token one by one
+ yield top_ids
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
@@ -287,8 +287,16 @@ class EspnetRelPositionalEncoding(torch.nn.Module):
  Returns:
  torch.Tensor: Corresponding encoding
  """
- pos_emb = self.pe[
- :,
- self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
- ]
+ # How to subscript a Union type:
+ # https://github.com/pytorch/pytorch/issues/69434
+ if isinstance(offset, int):
+ pos_emb = self.pe[
+ :,
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
+ ]
+ elif isinstance(offset, torch.Tensor):
+ pos_emb = self.pe[
+ :,
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
+ ]
  return pos_emb
@@ -56,11 +56,16 @@ class Upsample1D(nn.Module):
  # In this mode, first repeat interpolate, than conv with stride=1
  self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)

- def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor, conv_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
  outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
- outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
+ if conv_cache.size(2) == 0:
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
+ else:
+ assert conv_cache.size(2) == self.stride * 2
+ outputs = torch.concat([conv_cache, outputs], dim=2)
+ conv_cache_new = outputs[:, :, -self.stride * 2:]
  outputs = self.conv(outputs)
- return outputs, input_lengths * self.stride
+ return outputs, input_lengths * self.stride, conv_cache_new


  class PreLookaheadLayer(nn.Module):
@@ -78,22 +83,32 @@
  kernel_size=3, stride=1, padding=0,
  )

- def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+ def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0), conv2_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
  """
  inputs: (batch_size, seq_len, channels)
  """
  outputs = inputs.transpose(1, 2).contiguous()
+ context = context.transpose(1, 2).contiguous()
  # look ahead
- outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
+ if context.size(2) == 0:
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
+ else:
+ assert context.size(2) == self.pre_lookahead_len
+ outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
  outputs = F.leaky_relu(self.conv1(outputs))
  # outputs
- outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
+ if conv2_cache.size(2) == 0:
+ outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
+ else:
+ assert conv2_cache.size(2) == self.conv2.kernel_size[0] - 1
+ outputs = torch.concat([conv2_cache, outputs], dim=2)
+ conv2_cache_new = outputs[:, :, -(self.conv2.kernel_size[0] - 1):]
  outputs = self.conv2(outputs)
  outputs = outputs.transpose(1, 2).contiguous()

  # residual connection
  outputs = outputs + inputs
- return outputs
+ return outputs, conv2_cache_new


  class UpsampleConformerEncoder(torch.nn.Module):
@@ -240,6 +255,7 @@ class UpsampleConformerEncoder(torch.nn.Module):
  xs_lens: torch.Tensor,
  decoding_chunk_size: int = 0,
  num_decoding_left_chunks: int = -1,
+ streaming: bool = False,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
  """Embed positions in tensor.

@@ -270,30 +286,20 @@ class UpsampleConformerEncoder(torch.nn.Module):
  xs = self.global_cmvn(xs)
  xs, pos_emb, masks = self.embed(xs, masks)
  mask_pad = masks # (B, 1, T/subsample_rate)
- chunk_masks = add_optional_chunk_mask(xs, masks,
- self.use_dynamic_chunk,
- self.use_dynamic_left_chunk,
- decoding_chunk_size,
- self.static_chunk_size,
- num_decoding_left_chunks)
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
  # lookahead + conformer encoder
- xs = self.pre_lookahead_layer(xs)
+ xs, _ = self.pre_lookahead_layer(xs)
  xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)

  # upsample + conformer encoder
  xs = xs.transpose(1, 2).contiguous()
- xs, xs_lens = self.up_layer(xs, xs_lens)
+ xs, xs_lens, _ = self.up_layer(xs, xs_lens)
  xs = xs.transpose(1, 2).contiguous()
  T = xs.size(1)
  masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
  xs, pos_emb, masks = self.up_embed(xs, masks)
  mask_pad = masks # (B, 1, T/subsample_rate)
- chunk_masks = add_optional_chunk_mask(xs, masks,
- self.use_dynamic_chunk,
- self.use_dynamic_left_chunk,
- decoding_chunk_size,
- self.static_chunk_size * self.up_layer.stride,
- num_decoding_left_chunks)
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size * self.up_layer.stride if streaming is True else 0, -1)
  xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)

  if self.normalize_before:
@@ -316,3 +322,100 @@ class UpsampleConformerEncoder(torch.nn.Module):
  for layer in self.up_encoders:
  xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
  return xs
+
+ @torch.jit.export
+ def forward_chunk(
+ self,
+ xs: torch.Tensor,
+ xs_lens: torch.Tensor,
+ offset: int = 0,
+ context: torch.Tensor = torch.zeros(0, 0, 0),
+ pre_lookahead_layer_conv2_cache: torch.Tensor = torch.zeros(0, 0, 0),
+ encoders_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0),
+ upsample_offset: int = 0,
+ upsample_conv_cache: torch.Tensor = torch.zeros(0, 0, 0),
+ upsample_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0)
+ ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[int, torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]]:
+ """Embed positions in tensor.
+
+ Args:
+ xs: padded input tensor (B, T, D)
+ xs_lens: input length (B)
+ decoding_chunk_size: decoding chunk size for dynamic chunk
+ 0: default for training, use random dynamic chunk.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
+ the chunk size is decoding_chunk_size.
+ >=0: use num_decoding_left_chunks
+ <0: use all left chunks
+ Returns:
+ encoder output tensor xs, and subsampled masks
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
+ masks: torch.Tensor batch padding mask after subsample
+ (B, 1, T' ~= T/subsample_rate)
+ NOTE(xcsong):
+ We pass the `__call__` method of the modules instead of `forward` to the
+ checkpointing API because `__call__` attaches all the hooks of the module.
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
+ """
+ assert xs.size(0) == 1
+ # tmp_masks is just for interface compatibility
+ tmp_masks = torch.ones(1,
+ xs.size(1),
+ device=xs.device,
+ dtype=torch.bool)
+ tmp_masks = tmp_masks.unsqueeze(1)
+ if self.global_cmvn is not None:
+ xs = self.global_cmvn(xs)
+ # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
+ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
+ offset += xs.size(1)
+ tmp_masks = torch.ones(1,
+ context.size(1),
+ device=context.device,
+ dtype=torch.bool)
+ tmp_masks = tmp_masks.unsqueeze(1)
+ if context.size(1) != 0:
+ context, _, _ = self.embed(context, tmp_masks, offset)
+
+ # lookahead + conformer encoder
+ xs, pre_lookahead_layer_conv2_cache = self.pre_lookahead_layer(xs, context, pre_lookahead_layer_conv2_cache)
+ # NOTE in cache mode we do not need to call add_optional_chunk_mask
+ chunk_masks = torch.ones((1, xs.size(1), offset), dtype=torch.bool, device=xs.device)
+ mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
+ encoders_kv_cache_list = []
+ for index, layer in enumerate(self.encoders):
+ xs, chunk_masks, encoders_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, encoders_kv_cache[index])
+ encoders_kv_cache_list.append(encoders_kv_cache_new)
+ encoders_kv_cache = torch.stack(encoders_kv_cache_list, dim=0)
+
+ # upsample
+ xs = xs.transpose(1, 2).contiguous()
+ xs, xs_lens, upsample_conv_cache = self.up_layer(xs, xs_lens, upsample_conv_cache)
+ xs = xs.transpose(1, 2).contiguous()
+
+ # tmp_masks is just for interface compatibility
+ tmp_masks = torch.ones(1,
+ xs.size(1),
+ device=xs.device,
+ dtype=torch.bool)
+ tmp_masks = tmp_masks.unsqueeze(1)
+ xs, pos_emb, masks = self.up_embed(xs, tmp_masks, upsample_offset)
+ upsample_offset += xs.size(1)
+
+ # conformer encoder
+ chunk_masks = torch.ones((1, xs.size(1), upsample_offset), dtype=torch.bool, device=xs.device)
+ mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
+ upsample_kv_cache_list = []
+ for index, layer in enumerate(self.up_encoders):
+ xs, chunk_masks, upsample_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, upsample_kv_cache[index])
+ upsample_kv_cache_list.append(upsample_kv_cache_new)
+ upsample_kv_cache = torch.stack(upsample_kv_cache_list, dim=0)
+
+ if self.normalize_before:
+ xs = self.after_norm(xs)
+ # Here we assume the mask is not changed in encoder layers, so just
+ # return the masks before encoder layers, and the masks will be used
+ # for cross attention with decoder later
+ return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache, upsample_offset, upsample_conv_cache, upsample_kv_cache)
@@ -32,6 +32,10 @@ from cosyvoice.transformer.attention import (MultiHeadedAttention,
  RelPositionMultiHeadedAttention)
  from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
  from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling
+ from cosyvoice.llm.llm import TransformerLM, Qwen2LM
+ from cosyvoice.flow.flow import MaskedDiffWithXvec, CausalMaskedDiffWithXvec
+ from cosyvoice.hifigan.generator import HiFTGenerator
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model


  COSYVOICE_ACTIVATION_CLASSES = {
@@ -68,3 +72,12 @@ COSYVOICE_ATTENTION_CLASSES = {
  "selfattn": MultiHeadedAttention,
  "rel_selfattn": RelPositionMultiHeadedAttention,
  }
+
+
+ def get_model_type(configs):
+ # NOTE CosyVoice2Model inherits CosyVoiceModel
+ if isinstance(configs['llm'], TransformerLM) and isinstance(configs['flow'], MaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
+ return CosyVoiceModel
+ if isinstance(configs['llm'], Qwen2LM) and isinstance(configs['flow'], CausalMaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
+ return CosyVoice2Model
+ raise TypeError('No valid model type found!')
@@ -162,5 +162,5 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
  # attention mask bias
  # NOTE(Mddct): torch.finfo jit issues
  # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
- mask = (1.0 - mask) * torch.finfo(dtype).min
+ mask = (1.0 - mask) * -1.0e+10
  return mask
@@ -1,5 +1,5 @@
  # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
- # 2024 Alibaba Inc (authors: Xiang Lyu)
+ # 2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -39,9 +39,47 @@ def read_json_lists(list_file):


  def load_wav(wav, target_sr):
- speech, sample_rate = torchaudio.load(wav)
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
  speech = speech.mean(dim=0, keepdim=True)
  if sample_rate != target_sr:
  assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
  speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
  return speech
+
+
+ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
+ import tensorrt as trt
+ logging.info("Converting onnx to trt...")
+ network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ logger = trt.Logger(trt.Logger.INFO)
+ builder = trt.Builder(logger)
+ network = builder.create_network(network_flags)
+ parser = trt.OnnxParser(network, logger)
+ config = builder.create_builder_config()
+ config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 33) # 8GB
+ if fp16:
+ config.set_flag(trt.BuilderFlag.FP16)
+ profile = builder.create_optimization_profile()
+ # load onnx model
+ with open(onnx_model, "rb") as f:
+ if not parser.parse(f.read()):
+ for error in range(parser.num_errors):
+ print(parser.get_error(error))
+ raise ValueError('failed to parse {}'.format(onnx_model))
+ # set input shapes
+ for i in range(len(trt_kwargs['input_names'])):
+ profile.set_shape(trt_kwargs['input_names'][i], trt_kwargs['min_shape'][i], trt_kwargs['opt_shape'][i], trt_kwargs['max_shape'][i])
+ tensor_dtype = trt.DataType.HALF if fp16 else trt.DataType.FLOAT
+ # set input and output data type
+ for i in range(network.num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.dtype = tensor_dtype
+ for i in range(network.num_outputs):
+ output_tensor = network.get_output(i)
+ output_tensor.dtype = tensor_dtype
+ config.add_optimization_profile(profile)
+ engine_bytes = builder.build_serialized_network(network, config)
+ # save trt engine
+ with open(trt_model, "wb") as f:
+ f.write(engine_bytes)
+ logging.info("Succesfully convert onnx to trt...")
@@ -13,6 +13,7 @@
  # limitations under the License.

  import re
+ import regex
  chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


@@ -127,3 +128,9 @@ def replace_blank(text: str):
  else:
  out_str.append(c)
  return "".join(out_str)
+
+
+ def is_only_punctuation(text):
+ # Regular expression: Match strings that consist only of punctuation marks or are empty.
+ punctuation_pattern = r'^[\p{P}\p{S}]*$'
+ return bool(regex.fullmatch(punctuation_pattern, text))
@@ -195,6 +195,10 @@ def add_optional_chunk_mask(xs: torch.Tensor,
  chunk_masks = masks & chunk_masks # (B, L, L)
  else:
  chunk_masks = masks
+ assert chunk_masks.dtype == torch.bool
+ if (chunk_masks.sum(dim=-1) == 0).sum().item() != 0:
+ print('get chunk_masks all false at some timestep, force set to true, make sure they are masked in futuer computation!')
+ chunk_masks[chunk_masks.sum(dim=-1) == 0] = True
  return chunk_masks


@@ -286,11 +286,15 @@ def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict):
  # optimizer.step().
  if torch.isfinite(grad_norm):
  scaler.step(optimizer)
+ else:
+ logging.warning('get infinite grad_norm, check your code/data if it appears frequently')
  scaler.update()
  else:
  grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
  if torch.isfinite(grad_norm):
  optimizer.step()
+ else:
+ logging.warning('get infinite grad_norm, check your code/data if it appears frequently')
  optimizer.zero_grad()
  scheduler.step()
  info_dict["lr"] = optimizer.param_groups[0]['lr']
@@ -336,7 +340,7 @@ def log_per_save(writer, info_dict):
  rank = int(os.environ.get('RANK', 0))
  logging.info(
  'Epoch {} Step {} CV info lr {} {} rank {}'.format(
- epoch, step + 1, lr, rank, ' '.join(['{}_{}'.format(k, v) for k, v in loss_dict.items()])))
+ epoch, step + 1, lr, rank, ' '.join(['{} {}'.format(k, v) for k, v in loss_dict.items()])))

  if writer is not None:
  for k in ['epoch', 'lr']: