PyPI - xinference - Versions diffs - 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend - Supply Chain Defender

xinference 1.5.0.post2py3-none-any.whl → 1.6.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (137) hide show

xinference/thirdparty/cosyvoice/cli/frontend.py CHANGED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from functools import partial
+from typing import Generator
 import json
 import onnxruntime
 import torch
@@ -31,7 +32,8 @@ except ImportError:
     from tn.chinese.normalizer import Normalizer as ZhNormalizer
     from tn.english.normalizer import Normalizer as EnNormalizer
     use_ttsfrd = False
-from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
+from cosyvoice.utils.file_utils import logging
+from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
 class CosyVoiceFrontEnd:
@@ -42,7 +44,6 @@ class CosyVoiceFrontEnd:
                  campplus_model: str,
                  speech_tokenizer_model: str,
                  spk2info: str = '',
-                 instruct: bool = False,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
@@ -58,9 +59,7 @@ class CosyVoiceFrontEnd:
             self.spk2info = torch.load(spk2info, map_location=self.device)
         else:
             self.spk2info = {}
-        self.instruct = instruct
         self.allowed_special = allowed_special
-        self.inflect_parser = inflect.engine()
         self.use_ttsfrd = use_ttsfrd
         if self.use_ttsfrd:
             self.frd = ttsfrd.TtsFrontendEngine()
@@ -69,14 +68,26 @@ class CosyVoiceFrontEnd:
                 'failed to initialize ttsfrd resource'
             self.frd.set_lang_type('pinyinvg')
         else:
-            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
             self.en_tn_model = EnNormalizer()
+            self.inflect_parser = inflect.engine()
     def _extract_text_token(self, text):
-        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
-        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
-        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
-        return text_token, text_token_len
+        if isinstance(text, Generator):
+            logging.info('get tts_text generator, will return _extract_text_token_generator!')
+            # NOTE add a dummy text_token_len for compatibility
+            return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
+        else:
+            text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
+            text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
+            text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
+            return text_token, text_token_len
+    def _extract_text_token_generator(self, text_generator):
+        for text in text_generator:
+            text_token, _ = self._extract_text_token(text)
+            for i in range(text_token.shape[1]):
+                yield text_token[:, i: i + 1]
     def _extract_speech_token(self, speech):
         assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
@@ -108,14 +119,17 @@ class CosyVoiceFrontEnd:
         return speech_feat, speech_feat_len
     def text_normalize(self, text, split=True, text_frontend=True):
-        if text_frontend is False:
+        if isinstance(text, Generator):
+            logging.info('get tts_text generator, will skip text_normalize!')
+            return [text]
+        if text_frontend is False or text == '':
             return [text] if split is True else text
         text = text.strip()
-        if contains_chinese(text):
-            if self.use_ttsfrd:
-                texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
-                text = ''.join(texts)
-            else:
+        if self.use_ttsfrd:
+            texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
+            text = ''.join(texts)
+        else:
+            if contains_chinese(text):
                 text = self.zh_tn_model.normalize(text)
                 text = text.replace("\n", "")
                 text = replace_blank(text)
@@ -126,18 +140,13 @@ class CosyVoiceFrontEnd:
                 text = re.sub(r'[，,、]+$', '。', text)
                 texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
                                              token_min_n=60, merge_len=20, comma_split=False))
-        else:
-            if self.use_ttsfrd:
-                texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
-                text = ''.join(texts)
             else:
                 text = self.en_tn_model.normalize(text)
                 text = spell_out_number(text, self.inflect_parser)
                 texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                              token_min_n=60, merge_len=20, comma_split=False))
-        if split is False:
-            return text
-        return texts
+        texts = [i for i in texts if not is_only_punctuation(i)]
+        return texts if split is True else text
     def frontend_sft(self, tts_text, spk_id):
         tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
@@ -145,28 +154,32 @@ class CosyVoiceFrontEnd:
         model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
         return model_input
-    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
+    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
-        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
-        prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
-        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
-        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
-        if resample_rate == 24000:
-            # cosyvoice2, force speech_feat % speech_token = 2
-            token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
-            speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
-            speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
-        embedding = self._extract_spk_embedding(prompt_speech_16k)
-        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
-                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
-                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
-                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
-                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
-                       'llm_embedding': embedding, 'flow_embedding': embedding}
+        if zero_shot_spk_id == '':
+            prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+            prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
+            speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
+            speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+            if resample_rate == 24000:
+                # cosyvoice2, force speech_feat % speech_token = 2
+                token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
+                speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
+                speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
+            embedding = self._extract_spk_embedding(prompt_speech_16k)
+            model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
+                           'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                           'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                           'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                           'llm_embedding': embedding, 'flow_embedding': embedding}
+        else:
+            model_input = self.spk2info[zero_shot_spk_id]
+        model_input['text'] = tts_text_token
+        model_input['text_len'] = tts_text_token_len
         return model_input
-    def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
-        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
+    def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
+        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
         del model_input['prompt_text']
         del model_input['prompt_text_len']
@@ -183,23 +196,10 @@ class CosyVoiceFrontEnd:
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
-    def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
-        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
-        prompt_text_token, prompt_text_token_len = self._extract_text_token(instruct_text + '<|endofprompt|>')
-        prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
-        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
-        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
-        if resample_rate == 24000:
-            # cosyvoice2, force speech_feat % speech_token = 2
-            token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
-            speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
-            speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
-        embedding = self._extract_spk_embedding(prompt_speech_16k)
-        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
-                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
-                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
-                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
-                       'llm_embedding': embedding, 'flow_embedding': embedding}
+    def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
+        model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
+        del model_input['llm_prompt_speech_token']
+        del model_input['llm_prompt_speech_token_len']
         return model_input
     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):