xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff covers two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +209 -40
- xinference/client/restful/restful_client.py +7 -26
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/image_interface.py +28 -0
- xinference/core/model.py +110 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +17 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/audio/fish_speech.py +9 -9
- xinference/model/audio/model_spec.json +9 -9
- xinference/model/audio/whisper.py +4 -1
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/core.py +2 -1
- xinference/model/image/model_spec.json +16 -4
- xinference/model/image/model_spec_modelscope.json +16 -4
- xinference/model/image/sdapi.py +136 -0
- xinference/model/image/stable_diffusion/core.py +164 -19
- xinference/model/llm/__init__.py +29 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +1011 -1296
- xinference/model/llm/llm_family.py +34 -53
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +981 -1122
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +36 -18
- xinference/model/llm/transformers/chatglm.py +168 -306
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +55 -50
- xinference/model/llm/transformers/deepseek_v2.py +340 -0
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_audio.py +168 -0
- xinference/model/llm/transformers/qwen2_vl.py +234 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +280 -554
- xinference/model/llm/vllm/core.py +161 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
- xinference/thirdparty/fish_speech/tools/api.py +79 -134
- xinference/thirdparty/fish_speech/tools/commons.py +35 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
- xinference/thirdparty/fish_speech/tools/file.py +17 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
- xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
- xinference/thirdparty/fish_speech/tools/webui.py +12 -146
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +14 -53
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
- xinference/web/ui/build/static/js/main.754740c0.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
- xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/bin/export_jit.py (new file)

```diff
@@ -0,0 +1,64 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+import torch
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    torch._C._jit_set_fusion_strategy([('STATIC', 1)])
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+
+    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_trt=False)
+
+    # 1. export llm text_encoder
+    llm_text_encoder = cosyvoice.model.llm.text_encoder.half()
+    script = torch.jit.script(llm_text_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+
+    # 2. export llm llm
+    llm_llm = cosyvoice.model.llm.llm.half()
+    script = torch.jit.script(llm_llm)
+    script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+
+if __name__ == '__main__':
+    main()
```
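The script above writes two TorchScript archives that `CosyVoiceModel.load_jit` (added in the cli/model.py hunk further down) loads back with `torch.jit.load`. Below is a minimal, self-contained sketch of that script/freeze/optimize/save round trip, using a toy module rather than anything from the package:

```python
# Minimal, self-contained sketch (not the package code) of the TorchScript
# export pattern used by export_jit.py: script -> freeze -> optimize_for_inference
# -> save, then load the artifact back with torch.jit.load.
import torch

class TinyEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.proj(x))

module = TinyEncoder().eval()          # freeze() requires eval mode
script = torch.jit.script(module)      # compile to TorchScript
script = torch.jit.freeze(script)      # inline weights, drop training-only code
script = torch.jit.optimize_for_inference(script)
script.save('tiny_encoder.zip')        # same .zip container format as llm.llm.fp16.zip

reloaded = torch.jit.load('tiny_encoder.zip')
print(reloaded(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
```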
xinference/thirdparty/cosyvoice/bin/export_trt.py (new file)

```diff
@@ -0,0 +1,8 @@
+# TODO Same logic as export_jit: finish the ONNX export of the estimator in the flow part.
+# Document the TensorRT installation steps here. If it is not installed, do not run this script; tell the user to install it first, with no other option.
+try:
+    import tensorrt
+except ImportError:
+    print('step1, download\n step2. unzip and install the whl,')
+# In the install command, pass the TensorRT root directory via an environment variable, e.g. os.environ['tensorrt_root_dir']/bin/exetrace, then run the export command from Python via subprocess.
+# Later the run command will be written in run.sh: tensorrt_root_dir=xxxx python cosyvoice/bin/export_trt.py --model_dir xxx
```
xinference/thirdparty/cosyvoice/bin/inference.py

```diff
@@ -100,10 +100,13 @@ def main():
                        'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                        'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                        'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
-
+        tts_speeches = []
+        for model_output in model.inference(**model_input):
+            tts_speeches.append(model_output['tts_speech'])
+        tts_speeches = torch.concat(tts_speeches, dim=1)
         tts_key = '{}_{}'.format(utts[0], tts_index[0])
         tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
-        torchaudio.save(tts_fn,
+        torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
         f.write('{} {}\n'.format(tts_key, tts_fn))
         f.flush()
     f.close()
```
xinference/thirdparty/cosyvoice/cli/cosyvoice.py

```diff
@@ -12,15 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import
+import time
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.utils.file_utils import logging
 
 class CosyVoice:
 
-    def __init__(self, model_dir):
+    def __init__(self, model_dir, load_jit=True):
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
@@ -38,46 +39,61 @@ class CosyVoice:
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
+        if load_jit:
+            self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir),
+                                '{}/llm.llm.fp16.zip'.format(model_dir))
         del configs
 
     def list_avaliable_spks(self):
         spks = list(self.frontend.spk2info.keys())
         return spks
 
-    def inference_sft(self, tts_text, spk_id):
-        tts_speeches = []
+    def inference_sft(self, tts_text, spk_id, stream=False):
         for i in self.frontend.text_normalize(tts_text, split=True):
             model_input = self.frontend.frontend_sft(i, spk_id)
-
-
-
+            start_time = time.time()
+            logging.info('synthesis text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                speech_len = model_output['tts_speech'].shape[1] / 22050
+                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+                yield model_output
+                start_time = time.time()
 
-    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False):
         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
-        tts_speeches = []
         for i in self.frontend.text_normalize(tts_text, split=True):
             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
-
-
-
+            start_time = time.time()
+            logging.info('synthesis text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                speech_len = model_output['tts_speech'].shape[1] / 22050
+                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+                yield model_output
+                start_time = time.time()
 
-    def inference_cross_lingual(self, tts_text, prompt_speech_16k):
+    def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False):
         if self.frontend.instruct is True:
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
-        tts_speeches = []
         for i in self.frontend.text_normalize(tts_text, split=True):
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
-
-
-
+            start_time = time.time()
+            logging.info('synthesis text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                speech_len = model_output['tts_speech'].shape[1] / 22050
+                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+                yield model_output
+                start_time = time.time()
 
-    def inference_instruct(self, tts_text, spk_id, instruct_text):
+    def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False):
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
         instruct_text = self.frontend.text_normalize(instruct_text, split=False)
-        tts_speeches = []
         for i in self.frontend.text_normalize(tts_text, split=True):
             model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
-
-
-
+            start_time = time.time()
+            logging.info('synthesis text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                speech_len = model_output['tts_speech'].shape[1] / 22050
+                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+                yield model_output
+                start_time = time.time()
```
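All four `inference_*` methods are now generators that take a `stream` flag and yield audio chunks instead of returning one waveform. A hedged usage sketch of consuming the new API follows; the model directory, speaker id, and input text are illustrative assumptions, and the 22050 Hz rate and `torch.concat` stitching mirror the bin/inference.py hunk above:

```python
# Hedged usage sketch, not from the package: consume the generator-style API
# added above. Model directory, speaker id, and text are illustrative assumptions.
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False)
chunks = []
for out in cosyvoice.inference_sft('Hello, streaming world.', spk_id='中文女', stream=True):
    chunks.append(out['tts_speech'])   # each chunk is (1, n_samples) at 22050 Hz
full = torch.concat(chunks, dim=1)     # same stitching as bin/inference.py above
torchaudio.save('sft_stream.wav', full, sample_rate=22050)
```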
xinference/thirdparty/cosyvoice/cli/model.py

```diff
@@ -12,6 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
+import numpy as np
+import threading
+import time
+from contextlib import nullcontext
+import uuid
+from cosyvoice.utils.common import fade_in_out
+
 
 class CosyVoiceModel:
 
@@ -23,38 +30,144 @@ class CosyVoiceModel:
         self.llm = llm
         self.flow = flow
         self.hift = hift
+        self.token_min_hop_len = 100
+        self.token_max_hop_len = 200
+        self.token_overlap_len = 20
+        # mel fade in out
+        self.mel_overlap_len = 34
+        self.mel_window = np.hamming(2 * self.mel_overlap_len)
+        # hift cache
+        self.mel_cache_len = 20
+        self.source_cache_len = int(self.mel_cache_len * 256)
+        # rtf and decoding related
+        self.stream_scale_factor = 1
+        assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.flow_hift_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.lock = threading.Lock()
+        # dict used to store session related variable
+        self.tts_speech_token_dict = {}
+        self.llm_end_dict = {}
+        self.mel_overlap_dict = {}
+        self.hift_cache_dict = {}
 
     def load(self, llm_model, flow_model, hift_model):
         self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
         self.llm.to(self.device).eval()
+        self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
         self.hift.to(self.device).eval()
 
-    def
- … (removed lines 36-60 not shown)
+    def load_jit(self, llm_text_encoder_model, llm_llm_model):
+        llm_text_encoder = torch.jit.load(llm_text_encoder_model)
+        self.llm.text_encoder = llm_text_encoder
+        llm_llm = torch.jit.load(llm_llm_model)
+        self.llm.llm = llm_llm
+
+    def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
+        with self.llm_context:
+            for i in self.llm.inference(text=text.to(self.device),
+                                        text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
+                                        prompt_text=prompt_text.to(self.device),
+                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                        embedding=llm_embedding.to(self.device).half(),
+                                        sampling=25,
+                                        max_token_text_ratio=30,
+                                        min_token_text_ratio=3):
+                self.tts_speech_token_dict[uuid].append(i)
+        self.llm_end_dict[uuid] = True
+
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False):
+        with self.flow_hift_context:
+            tts_mel = self.flow.inference(token=token.to(self.device),
+                                          token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                          prompt_token=prompt_token.to(self.device),
+                                          prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                          prompt_feat=prompt_feat.to(self.device),
+                                          prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                          embedding=embedding.to(self.device))
+            # mel overlap fade in out
+            # if self.mel_overlap_dict[uuid] is not None:
+            #     tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
+            # append hift cache
+            if self.hift_cache_dict[uuid] is not None:
+                hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
+                tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
+            else:
+                hift_cache_source = torch.zeros(1, 1, 0)
+            # keep overlap mel and hift cache
+            if finalize is False:
+                self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
+                tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
+                tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
+                self.hift_cache_dict[uuid] = {'source': tts_source[:, :, -self.source_cache_len:], 'mel': tts_mel[:, :, -self.mel_cache_len:]}
+                tts_speech = tts_speech[:, :-self.source_cache_len]
+            else:
+                tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
+        return tts_speech
+
+    def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32),
+                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, **kwargs):
+        # this_uuid is used to track variables related to this inference thread
+        this_uuid = str(uuid.uuid1())
+        with self.lock:
+            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid], self.mel_overlap_dict[this_uuid], self.hift_cache_dict[this_uuid] = [], False, None, None
+        p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        p.start()
+        if stream is True:
+            token_hop_len = self.token_min_hop_len
+            while True:
+                time.sleep(0.1)
+                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
+                    this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len], dim=1)
+                    with self.flow_hift_context:
+                        this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                                         prompt_token=flow_prompt_speech_token,
+                                                         prompt_feat=prompt_speech_feat,
+                                                         embedding=flow_embedding,
+                                                         uuid=this_uuid,
+                                                         finalize=False)
+                    yield {'tts_speech': this_tts_speech.cpu()}
+                    with self.lock:
+                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
+                    # increase token_hop_len for better speech quality
+                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
+                    break
+            p.join()
+            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
+            this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
+            with self.flow_hift_context:
+                this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                                 prompt_token=flow_prompt_speech_token,
+                                                 prompt_feat=prompt_speech_feat,
+                                                 embedding=flow_embedding,
+                                                 uuid=this_uuid,
+                                                 finalize=True)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        else:
+            # deal with all tokens
+            p.join()
+            this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
+            with self.flow_hift_context:
+                this_tts_speech = self.token2wav(token=this_tts_speech_token,
+                                                 prompt_token=flow_prompt_speech_token,
+                                                 prompt_feat=prompt_speech_feat,
+                                                 embedding=flow_embedding,
+                                                 uuid=this_uuid,
+                                                 finalize=True)
+            yield {'tts_speech': this_tts_speech.cpu()}
+        with self.lock:
+            self.tts_speech_token_dict.pop(this_uuid)
+            self.llm_end_dict.pop(this_uuid)
+            self.mel_overlap_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+        if torch.cuda.is_initialized():
+            torch.cuda.synchronize()
```
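`CosyVoiceModel.inference` now runs the token LLM in a background thread (`llm_job`) while the main generator drains the per-session token list in hops of `token_hop_len` plus a `token_overlap_len` overlap. The following is a minimal, generic sketch of that producer/consumer hop pattern, not the package code (the real implementation additionally guards the shared list with `self.lock`):

```python
# Minimal, self-contained sketch (not the package code) of the producer/consumer
# pattern used by CosyVoiceModel.inference: a background thread produces tokens,
# the generator drains them in hops with a small overlap kept between windows.
import threading
import time

tokens, done = [], False

def producer(n=500):
    global done
    for t in range(n):
        tokens.append(t)          # stands in for one speech token
        time.sleep(0.001)
    done = True

def consume(hop=100, overlap=20):
    threading.Thread(target=producer).start()
    while True:
        time.sleep(0.05)
        if len(tokens) >= hop + overlap:
            yield tokens[:hop + overlap]      # decode this window to audio
            del tokens[:hop]                  # keep `overlap` tokens for the next window
        if done and len(tokens) < hop + overlap:
            break
    yield tokens[:]                           # final partial chunk

for chunk in consume():
    print(len(chunk), 'tokens -> one audio chunk')
```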
xinference/thirdparty/cosyvoice/flow/flow.py

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import random
 from typing import Dict, Optional
 import torch
 import torch.nn as nn
@@ -77,6 +78,11 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         # get conditions
         conds = torch.zeros(feat.shape, device=token.device)
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
         conds = conds.transpose(1, 2)
 
         mask = (~make_pad_mask(feat_len)).to(h)
@@ -105,6 +111,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         embedding = self.spk_embed_affine_layer(embedding)
 
         # concat text and prompt_text
+        token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
         token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
         mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
         token = self.input_embedding(torch.clamp(token, min=0)) * mask
@@ -112,17 +119,16 @@ class MaskedDiffWithXvec(torch.nn.Module):
         # text encode
         h, h_lengths = self.encoder(token, token_len)
         h = self.encoder_proj(h)
-
-        h, h_lengths = self.length_regulator(h,
+        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / 50 * 22050 / 256)
+        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2)
 
         # get conditions
-        conds = torch.zeros([1,
-
-        for i, j in enumerate(prompt_feat_len):
-            conds[i, :j] = prompt_feat[i]
+        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
+        conds[:, :mel_len1] = prompt_feat
         conds = conds.transpose(1, 2)
 
-        mask = (~make_pad_mask(feat_len)).to(h)
+        # mask = (~make_pad_mask(feat_len)).to(h)
+        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
         feat = self.decoder(
             mu=h.transpose(1, 2).contiguous(),
             mask=mask.unsqueeze(1),
@@ -130,6 +136,6 @@ class MaskedDiffWithXvec(torch.nn.Module):
             cond=conds,
             n_timesteps=10
         )
-
-
+        feat = feat[:, :, mel_len1:]
+        assert feat.shape[2] == mel_len2
         return feat
```
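The new `mel_len2 = int(token_len2 / 50 * 22050 / 256)` line converts a speech-token count into a mel-frame count: the divisor 50 implies a rate of 50 tokens per second, and 22050 / 256 is the mel frame rate of the 22.05 kHz vocoder with a 256-sample hop. A small worked check:

```python
# Worked check of the mel-length formula from the hunk above:
# tokens at 50 per second are mapped to mel frames at 22050 / 256 per second.
token_len2 = 200                               # 200 tokens == 4.0 s of speech
seconds = token_len2 / 50                      # 4.0
mel_len2 = int(token_len2 / 50 * 22050 / 256)  # int(4.0 * 86.1328...) == 344
print(seconds, mel_len2)                       # 4.0 344
```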
xinference/thirdparty/cosyvoice/flow/length_regulator.py

```diff
@@ -13,6 +13,7 @@
 # limitations under the License.
 from typing import Tuple
 import torch.nn as nn
+import torch
 from torch.nn import functional as F
 from cosyvoice.utils.mask import make_pad_mask
 
@@ -43,7 +44,25 @@ class InterpolateRegulator(nn.Module):
     def forward(self, x, ylens=None):
         # x in (B, T, D)
         mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
-        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='
+        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
         out = self.model(x).transpose(1, 2).contiguous()
         olens = ylens
         return out * mask, olens
+
+    def inference(self, x1, x2, mel_len1, mel_len2):
+        # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
+        # x in (B, T, D)
+        if x2.shape[1] > 40:
+            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=34, mode='linear')
+            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - 34 * 2, mode='linear')
+            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=34, mode='linear')
+            x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
+        else:
+            x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
+        if x1.shape[1] != 0:
+            x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
+            x = torch.concat([x1, x2], dim=2)
+        else:
+            x = x2
+        out = self.model(x).transpose(1, 2).contiguous()
+        return out, mel_len1 + mel_len2
```
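`InterpolateRegulator.inference` resizes the prompt features and the generated features to their target mel lengths separately, so the prompt/generation boundary lands on a known frame. A self-contained sketch of the underlying `F.interpolate` length change on a (B, T, D) sequence, with illustrative sizes (not the package code):

```python
# Self-contained sketch (illustrative sizes) of resizing a (B, T, D) feature
# sequence to a target number of mel frames with F.interpolate, as
# InterpolateRegulator.inference does above.
import torch
import torch.nn.functional as F

x = torch.randn(1, 120, 80)                        # (B, T=120 tokens, D=80 dims)
mel_len = 344                                      # target frame count
y = F.interpolate(x.transpose(1, 2).contiguous(),  # interpolate along the time axis
                  size=mel_len, mode='linear')
print(y.shape)                                     # torch.Size([1, 80, 344])
```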
xinference/thirdparty/cosyvoice/hifigan/generator.py

```diff
@@ -335,10 +335,14 @@ class HiFTGenerator(nn.Module):
         inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
         return inverse_transform
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
         f0 = self.f0_predictor(x)
         s = self._f02source(f0)
 
+        # use cache_source to avoid glitch
+        if cache_source.shape[2] == 0:
+            s[:, :, :cache_source.shape[2]] = cache_source
+
         s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
         s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
 
@@ -370,7 +374,7 @@ class HiFTGenerator(nn.Module):
 
         x = self._istft(magnitude, phase)
         x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x
+        return x, s
 
     def remove_weight_norm(self):
         print('Removing weight norm...')
@@ -387,5 +391,5 @@ class HiFTGenerator(nn.Module):
             l.remove_weight_norm()
 
     @torch.inference_mode()
-    def inference(self, mel: torch.Tensor) -> torch.Tensor:
-        return self.forward(x=mel)
+    def inference(self, mel: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        return self.forward(x=mel, cache_source=cache_source)
```
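`HiFTGenerator.forward` now accepts a `cache_source` excitation from the previous chunk and returns the source signal `s` alongside the waveform, so consecutive streaming chunks can be stitched without glitches; cli/model.py likewise keeps a Hamming `mel_window` for a (currently commented-out) overlap crossfade. A minimal sketch of that window-based crossfade idea, not the package code:

```python
# Minimal sketch (not the package code) of crossfading two overlapping chunks
# with a Hamming window, the idea behind mel_window / fade_in_out in cli/model.py
# and the cache handling added to HiFTGenerator above.
import numpy as np

overlap = 34
window = np.hamming(2 * overlap)          # rises over [:overlap], falls over [overlap:]
prev_tail = np.random.randn(overlap)      # end of the previous chunk
next_head = np.random.randn(overlap)      # start of the next chunk
blended = prev_tail * window[overlap:] + next_head * window[:overlap]
print(blended.shape)                      # (34,)
```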
xinference/thirdparty/cosyvoice/llm/llm.py

```diff
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Optional,
+from typing import Dict, Optional, Callable, List, Generator
 import torch
 from torch import nn
 import torch.nn.functional as F
@@ -31,6 +31,7 @@ class TransformerLM(torch.nn.Module):
             speech_token_size: int,
             text_encoder: torch.nn.Module,
             llm: torch.nn.Module,
+            sampling: Callable,
             length_normalized_loss: bool = True,
             lsm_weight: float = 0.0,
             spk_embed_dim: int = 192,
@@ -63,6 +64,9 @@ class TransformerLM(torch.nn.Module):
         self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
         self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)
 
+        # 4. sampling method
+        self.sampling = sampling
+
     def encode(
             self,
             text: torch.Tensor,
@@ -132,14 +136,12 @@ class TransformerLM(torch.nn.Module):
     def sampling_ids(
             self,
             weighted_scores: torch.Tensor,
-
-
+            decoded_tokens: List,
+            sampling: int,
             ignore_eos: bool = True,
     ):
         while True:
-
-            top_ids = prob.multinomial(beam_size, replacement=True)
-            top_ids = indices[top_ids]
+            top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
             if (not ignore_eos) or (self.speech_token_size not in top_ids):
                 break
         return top_ids
@@ -154,11 +156,10 @@ class TransformerLM(torch.nn.Module):
             prompt_speech_token: torch.Tensor,
             prompt_speech_token_len: torch.Tensor,
             embedding: torch.Tensor,
-            beam_size: int = 1,
             sampling: int = 25,
             max_token_text_ratio: float = 20,
             min_token_text_ratio: float = 2,
-    ) -> torch.Tensor:
+    ) -> Generator[torch.Tensor, None, None]:
         device = text.device
         text = torch.concat([prompt_text, text], dim=1)
         text_len += prompt_text_len
@@ -173,7 +174,7 @@ class TransformerLM(torch.nn.Module):
             embedding = self.spk_embed_affine_layer(embedding)
             embedding = embedding.unsqueeze(dim=1)
         else:
-            embedding = torch.zeros(1, 0, self.llm_input_size).to(device)
+            embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
 
         # 3. concat llm_input
         sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
@@ -181,7 +182,7 @@ class TransformerLM(torch.nn.Module):
         if prompt_speech_token_len != 0:
             prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
         else:
-            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size).to(device)
+            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
         lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
 
         # 4. cal min/max_length
@@ -196,11 +197,11 @@ class TransformerLM(torch.nn.Module):
             y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache,
                                                                   att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool))
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-            top_ids = self.sampling_ids(logp.squeeze(dim=0),
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
             if top_ids == self.speech_token_size:
                 break
+            # in stream mode, yield token one by one
+            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
             out_tokens.append(top_ids)
            offset += lm_input.size(1)
             lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
-
-        return torch.tensor([out_tokens], dtype=torch.int64, device=device)
```
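`TransformerLM` now receives a `sampling: Callable` at construction, and `sampling_ids` delegates to `self.sampling(weighted_scores, decoded_tokens, sampling)` instead of the old multinomial draw over a beam. A hedged sketch of a callable with that interface follows; this plain top-k draw is only an illustration, and the function actually wired in by the package may differ:

```python
# Hedged sketch of a sampling callable matching the interface used by
# sampling_ids above: (weighted_scores, decoded_tokens, sampling) -> token ids.
# A plain top-k draw for illustration; the package's real function may differ
# (e.g. nucleus sampling with repetition handling).
import torch

def top_k_sampling(weighted_scores: torch.Tensor, decoded_tokens: list, sampling: int) -> torch.Tensor:
    # weighted_scores: (vocab,) log-probabilities for the next speech token
    top_logp, top_indices = weighted_scores.topk(sampling)
    choice = top_logp.softmax(dim=-1).multinomial(1)
    return top_indices[choice]

logp = torch.randn(4096).log_softmax(dim=-1)       # fake next-token scores
print(top_k_sampling(logp, decoded_tokens=[], sampling=25))
```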
xinference/thirdparty/cosyvoice/transformer/attention.py

```diff
@@ -222,7 +222,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         torch.nn.init.xavier_uniform_(self.pos_bias_u)
         torch.nn.init.xavier_uniform_(self.pos_bias_v)
 
-    def rel_shift(self, x):
+    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
         """Compute relative positional encoding.
 
         Args:
@@ -233,10 +233,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             torch.Tensor: Output tensor.
 
         """
-        zero_pad = torch.zeros((
+        zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
+                               device=x.device,
+                               dtype=x.dtype)
         x_padded = torch.cat([zero_pad, x], dim=-1)
 
-        x_padded = x_padded.view(
+        x_padded = x_padded.view(x.size()[0],
+                                 x.size()[1],
+                                 x.size(3) + 1, x.size(2))
         x = x_padded[:, :, 1:].view_as(x)[
             :, :, :, : x.size(-1) // 2 + 1
         ]  # only keep the positions from 0 to time2
```
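`rel_shift` above gains explicit shapes plus device/dtype handling but keeps the usual relative-shift trick: pad one zero column, reshape so the last two axes swap, then drop the first row. A small, self-contained demonstration with toy sizes (not the package code):

```python
# Self-contained demonstration (toy sizes, not the package code) of the
# relative-shift trick used in rel_shift above: pad one zero column, reshape,
# drop the first row, and keep only the non-negative relative positions.
import torch

batch, heads, time1, time2 = 1, 1, 3, 5   # time2 = 2 * time1 - 1 relative positions
x = torch.arange(batch * heads * time1 * time2, dtype=torch.float32)
x = x.view(batch, heads, time1, time2)

zero_pad = torch.zeros((batch, heads, time1, 1), dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=-1)               # (B, H, T1, T2 + 1)
x_padded = x_padded.view(batch, heads, time2 + 1, time1)  # swap the last two axes
shifted = x_padded[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1]
print(shifted.shape)  # torch.Size([1, 1, 3, 3])
```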