xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +8 -4
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +7 -5
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/local.py +5 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/deploy/worker.py +6 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/core.py +3 -0
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +18 -4
- xinference/model/embedding/vllm/core.py +36 -9
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +178 -1
- xinference/model/image/stable_diffusion/core.py +155 -23
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +688 -11
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +20 -18
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +44 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +87 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +156 -0
- xinference/model/utils.py +108 -0
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +2 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
- xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
Selected hunks from the diff are reproduced below; they appear to come from xinference/thirdparty/cosyvoice/cli/model.py. A few removed lines were truncated by the diff viewer and are kept as shown; removed lines whose content was lost entirely are omitted.

@@ -1,4 +1,5 @@
 # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +22,8 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
-from cosyvoice.utils.file_utils import convert_onnx_to_trt
+from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
+from cosyvoice.utils.common import TrtContextWrapper


 class CosyVoiceModel:
@@ -80,30 +82,28 @@ class CosyVoiceModel:
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder

-    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
+    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, trt_concurrent, fp16):
         assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
-        if not os.path.exists(flow_decoder_estimator_model):
+        if not os.path.exists(flow_decoder_estimator_model) or os.path.getsize(flow_decoder_estimator_model) == 0:
             convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
-        if os.path.getsize(flow_decoder_estimator_model) == 0:
-            raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
         del self.flow.decoder.estimator
         import tensorrt as trt
         with open(flow_decoder_estimator_model, 'rb') as f:
-            assert
-            self.flow.decoder.estimator = self.
+            estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+        assert estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
+        self.flow.decoder.estimator = TrtContextWrapper(estimator_engine, trt_concurrent=trt_concurrent, device=self.device)

     def get_trt_kwargs(self):
         min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
-        opt_shape = [(2, 80,
+        opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)]
         max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
         input_names = ["x", "mask", "mu", "cond"]
         return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}

     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        with self.llm_context, torch.cuda.amp.autocast(self.fp16):
+        with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False):
             if isinstance(text, Generator):
-                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
+                assert isinstance(self, CosyVoice2Model) and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2 and do not support vllm!'
                 for i in self.llm.inference_bistream(text=text,
                                                      prompt_text=prompt_text.to(self.device),
                                                      prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
@@ -118,7 +118,8 @@ class CosyVoiceModel:
                                         prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
                                         prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                         prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                        embedding=llm_embedding.to(self.device)
+                                        embedding=llm_embedding.to(self.device),
+                                        uuid=uuid):
                 self.tts_speech_token_dict[uuid].append(i)
         self.llm_end_dict[uuid] = True

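The reworked load_trt above deserializes the TensorRT engine once and hands it to TrtContextWrapper together with a trt_concurrent count, rather than creating a single execution context inline. The wrapper itself is not part of this diff; the sketch below is only a guess at its shape, assuming it pools one TensorRT execution context per concurrent caller (the class name and methods are hypothetical):

    import queue

    class TrtContextWrapperSketch:
        """Hypothetical stand-in for cosyvoice.utils.common.TrtContextWrapper."""

        def __init__(self, engine, trt_concurrent=1, device='cuda:0'):
            self.engine = engine
            self.device = device
            self._contexts = queue.Queue()
            for _ in range(trt_concurrent):
                # create_execution_context() is the standard TensorRT call for
                # obtaining an IExecutionContext from a deserialized engine
                self._contexts.put(engine.create_execution_context())

        def acquire(self):
            # block until one of the pooled contexts is free
            return self._contexts.get()

        def release(self, context):
            self._contexts.put(context)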
@@ -231,7 +232,9 @@ class CosyVoiceModel:
             self.mel_overlap_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
             self.flow_cache_dict.pop(this_uuid)
-        torch.cuda.
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()


 class CosyVoice2Model(CosyVoiceModel):
@@ -240,20 +243,17 @@ class CosyVoice2Model(CosyVoiceModel):
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
-                 fp16: bool = False
-                 use_flow_cache: bool = False):
+                 fp16: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
         self.fp16 = fp16
-        self.use_flow_cache = use_flow_cache
         if self.fp16 is True:
             self.llm.half()
             self.flow.half()
-        #
+        # NOTE must matching training static_chunk_size
         self.token_hop_len = 25
-        self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
         # hift cache
         self.mel_cache_len = 8
         self.source_cache_len = int(self.mel_cache_len * 480)
@@ -265,55 +265,35 @@ class CosyVoice2Model(CosyVoiceModel):
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
         self.llm_end_dict = {}
-        self.flow_cache_dict = {}
         self.hift_cache_dict = {}

-    def init_flow_cache(self):
-        encoder_cache = {'offset': 0,
-                         'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
-                         'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
-                         'upsample_offset': 0,
-                         'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
-                         'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
-        decoder_cache = {'offset': 0,
-                         'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
-                         'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
-                         'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
-                         'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
-        if self.fp16 is True:
-            for cache in [encoder_cache, decoder_cache]:
-                for k, v in cache.items():
-                    if isinstance(v, torch.Tensor):
-                        cache[k] = v.half()
-        cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
-        return cache
-
     def load_jit(self, flow_encoder_model):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder

-    def
+    def load_vllm(self, model_dir):
+        export_cosyvoice2_vllm(self.llm, model_dir, self.device)
+        from vllm import EngineArgs, LLMEngine
+        engine_args = EngineArgs(model=model_dir,
+                                 skip_tokenizer_init=True,
+                                 enable_prompt_embeds=True,
+                                 gpu_memory_utilization=0.2)
+        self.llm.vllm = LLMEngine.from_engine_args(engine_args)
+        self.llm.lock = threading.Lock()
+        del self.llm.llm.model.model.layers

-    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel,
+            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+                                             token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_token=prompt_token.to(self.device),
+                                             prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_feat=prompt_feat.to(self.device),
+                                             prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                             embedding=embedding.to(self.device),
+                                             streaming=stream,
+                                             finalize=finalize)
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
         # append hift cache
         if self.hift_cache_dict[uuid] is not None:
             hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
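Two things change in the block above: load_vllm exports the CosyVoice2 text-to-speech-token LLM with export_cosyvoice2_vllm, starts a vLLM LLMEngine with prompt embeddings enabled, and deletes the in-process transformer layers to free GPU memory; token2wav now always runs the flow decoder over the full token prefix and uses token_offset to slice off the mel frames that earlier chunks already produced. A toy illustration of that slicing (token_mel_ratio is assumed to be 2 here; the real value comes from the flow model):

    import torch

    token_mel_ratio = 2                 # assumed mel frames per speech token
    token_offset = 25                   # tokens already rendered by earlier chunks
    tts_mel = torch.randn(1, 80, 120)   # mel for the whole token prefix

    # keep only frames belonging to tokens that have not been emitted yet
    new_mel = tts_mel[:, :, token_offset * token_mel_ratio:]
    print(new_mel.shape)                # torch.Size([1, 80, 70])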
@@ -348,34 +328,30 @@ class CosyVoice2Model(CosyVoiceModel):
         with self.lock:
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
-            self.flow_cache_dict[this_uuid] = self.init_flow_cache()
         if source_speech_token.shape[1] == 0:
             p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
         else:
             p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
-            flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
-            prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
+            token_offset = 0
+            prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
             while True:
                 time.sleep(0.1)
+                this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
+                if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
                                                      embedding=flow_embedding,
+                                                     token_offset=token_offset,
                                                      uuid=this_uuid,
+                                                     stream=stream,
                                                      finalize=False)
-                    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
-                    prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
+                    token_offset += this_token_hop_len
                     yield {'tts_speech': this_tts_speech.cpu()}
-                    self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
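The streaming branch above no longer trims prompt features to fit a flow cache; instead it tracks token_offset and pads the first hop so that the prompt tokens plus the first chunk land on a token_hop_len boundary. A small worked example of that scheduling arithmetic (pre_lookahead_len is an assumed value; the real one comes from self.flow):

    import numpy as np

    token_hop_len = 25      # matches self.token_hop_len set in __init__ above
    pre_lookahead_len = 3   # assumed; taken from self.flow.pre_lookahead_len at runtime
    prompt_len = 37         # example number of flow prompt tokens

    # pad the first hop so prompt + first chunk align to a hop boundary
    prompt_token_pad = int(np.ceil(prompt_len / token_hop_len) * token_hop_len - prompt_len)
    first_hop = token_hop_len + prompt_token_pad

    print(prompt_token_pad)                   # 13
    print(first_hop + pre_lookahead_len)      # 41 tokens needed before the first chunk
    print(token_hop_len + pre_lookahead_len)  # 28 new tokens beyond the offset per later chunk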
@@ -384,18 +360,19 @@ class CosyVoice2Model(CosyVoiceModel):
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=token_offset,
                                              uuid=this_uuid,
                                              finalize=True)
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
             p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=0,
                                              uuid=this_uuid,
                                              finalize=True,
                                              speed=speed)
@@ -404,5 +381,6 @@ class CosyVoice2Model(CosyVoiceModel):
             self.tts_speech_token_dict.pop(this_uuid)
             self.llm_end_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()
The following hunks appear to come from xinference/thirdparty/cosyvoice/dataset/dataset.py.

@@ -14,14 +14,13 @@
 # limitations under the License.

 import random
-import json
 import math
 from functools import partial

 import torch
 import torch.distributed as dist
 from torch.utils.data import IterableDataset
-from cosyvoice.utils.file_utils import read_lists
+from cosyvoice.utils.file_utils import read_lists


 class Processor(IterableDataset):
@@ -127,10 +126,9 @@ def Dataset(data_list_file,
             data_pipeline,
             mode='train',
             gan=False,
+            dpo=False,
             shuffle=True,
-            partition=True
-            tts_file='',
-            prompt_utt2data=''):
+            partition=True):
     """ Construct dataset from arguments

     We have two shuffle stage in the Dataset. The first is global
@@ -142,23 +140,12 @@ def Dataset(data_list_file,
         tokenizer (BaseTokenizer): tokenizer to tokenize
         partition(bool): whether to do data partition in terms of rank
     """
-    assert mode in ['train', 'inference']
     lists = read_lists(data_list_file)
-    if mode == 'inference':
-        with open(tts_file) as f:
-            tts_data = json.load(f)
-        utt2lists = read_json_lists(prompt_utt2data)
-        # filter unnecessary file in inference mode
-        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
     dataset = DataList(lists,
                        shuffle=shuffle,
                        partition=partition)
-        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
-    if gan is True:
-        # map partial arg to padding func in gan mode
-        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
+    # map partial arg to padding func
+    data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
     for func in data_pipeline:
         dataset = Processor(dataset, func, mode=mode)
     return dataset
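With the inference-only plumbing (tts_file, prompt_utt2data) gone, Dataset is now a pure training-data builder and simply forwards gan and the new dpo flag to the final padding stage. A hypothetical call might look like this (the list file name and data_pipeline are placeholders):

    # placeholders: 'train.data.list' and data_pipeline come from the training config
    train_dataset = Dataset('train.data.list',
                            data_pipeline=data_pipeline,
                            mode='train',
                            gan=False,
                            dpo=True,       # collate reject_speech_token for DPO-style training
                            shuffle=True,
                            partition=True)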
The remaining hunks appear to come from xinference/thirdparty/cosyvoice/dataset/processor.py.

@@ -43,8 +43,6 @@ def parquet_opener(data, mode='train', tts_data={}):
         for df in pq.ParquetFile(url).iter_batches(batch_size=64):
             df = df.to_pandas()
             for i in range(len(df)):
-                if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
-                    continue
                 sample.update(dict(df.loc[i]))
                 if mode == 'train':
                     # NOTE do not return sample directly, must initialize a new dict
@@ -100,6 +98,8 @@ def filter(data,
             continue
         if len(sample['speech_token']) == 0:
             continue
+        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+            continue
         if num_frames != 0:
             if len(sample['text_token']) / num_frames < min_output_input_ratio:
                 continue
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):

 def compute_fbank(data,
                   feat_extractor,
+                  token_mel_ratio=0,
                   mode='train'):
     """ Extract fbank

@@ -174,8 +175,13 @@ def compute_fbank(data,
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        if token_mel_ratio != 0:
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            feat = feat[:token_mel_ratio * token_len]
+            sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_feat'] = feat
         yield sample

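When token_mel_ratio is non-zero, compute_fbank trims both the fbank features and the speech tokens so that every retained token maps to exactly token_mel_ratio frames. Worked through with toy numbers:

    # toy numbers: 503 mel frames, 250 speech tokens, 2 mel frames per token (assumed ratio)
    feat_frames, n_tokens, token_mel_ratio = 503, 250, 2

    token_len = int(min(feat_frames / token_mel_ratio, n_tokens))   # min(251.5, 250) -> 250
    trimmed_frames = token_mel_ratio * token_len                    # 500
    print(token_len, trimmed_frames)                                # 250 500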
@@ -236,8 +242,6 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     for sample in data:
         assert 'text' in sample
         sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
-        if mode == 'inference':
-            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
         yield sample

@@ -345,18 +349,15 @@ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
 def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
     """ Wrapper for static/dynamic batch
     """
-    if
-        return static_batch(data,
+    if batch_type == 'static':
+        return static_batch(data, batch_size)
+    elif batch_type == 'dynamic':
+        return dynamic_batch(data, max_frames_in_batch)
     else:
-            return static_batch(data, batch_size)
-        elif batch_type == 'dynamic':
-            return dynamic_batch(data, max_frames_in_batch)
-        else:
-            logging.fatal('Unsupported batch type {}'.format(batch_type))
+        logging.fatal('Unsupported batch type {}'.format(batch_type))


-def padding(data, use_spk_embedding, mode='train', gan=False):
+def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
     """ Padding the data into training data

     Args:
@@ -418,16 +419,14 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
             # only gan train needs speech, delete it to save memory
             del batch["speech"]
             del batch["speech_len"]
-        if
-            batch
-                          'tts_text_token': tts_text_token,
-                          'tts_text_token_len': tts_text_token_len})
+        if dpo is True:
+            reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
+            reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
+            reject_speech_token = pad_sequence(reject_speech_token,
+                                               batch_first=True,
+                                               padding_value=0)
+            batch['reject_speech_token'] = reject_speech_token
+            batch['reject_speech_token_len'] = reject_speech_token_len
         if use_spk_embedding is True:
             batch["embedding"] = batch["spk_embedding"]
         else: