xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/model.py +3 -4
- xinference/core/supervisor.py +29 -1
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +64 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +19 -4
- xinference/model/embedding/vllm/core.py +40 -8
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +116 -9
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +42 -40
- xinference/model/llm/llm_family.json +435 -23
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +6 -12
- xinference/model/llm/utils.py +128 -46
- xinference/model/llm/vllm/core.py +8 -61
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/dataset/dataset.py

@@ -14,14 +14,13 @@
 # limitations under the License.
 
 import random
-import json
 import math
 from functools import partial
 
 import torch
 import torch.distributed as dist
 from torch.utils.data import IterableDataset
-from cosyvoice.utils.file_utils import read_lists, read_json_lists
+from cosyvoice.utils.file_utils import read_lists
 
 
 class Processor(IterableDataset):
@@ -127,10 +126,9 @@ def Dataset(data_list_file,
             data_pipeline,
             mode='train',
             gan=False,
+            dpo=False,
             shuffle=True,
-            partition=True,
-            tts_file='',
-            prompt_utt2data=''):
+            partition=True):
     """ Construct dataset from arguments
 
         We have two shuffle stage in the Dataset. The first is global
@@ -142,23 +140,12 @@ def Dataset(data_list_file,
             tokenizer (BaseTokenizer): tokenizer to tokenize
             partition(bool): whether to do data partition in terms of rank
     """
-    assert mode in ['train', 'inference']
     lists = read_lists(data_list_file)
-    if mode == 'inference':
-        with open(tts_file) as f:
-            tts_data = json.load(f)
-        utt2lists = read_json_lists(prompt_utt2data)
-        # filter unnecessary file in inference mode
-        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
     dataset = DataList(lists,
                        shuffle=shuffle,
                        partition=partition)
-    if mode == 'inference':
-        # map partial arg to parquet_opener func in inference mode
-        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
-    if gan is True:
-        # map partial arg to padding func in gan mode
-        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
+    # map partial arg to padding func
+    data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
     for func in data_pipeline:
         dataset = Processor(dataset, func, mode=mode)
     return dataset
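Net effect of the dataset.py changes: the inference-only plumbing (tts_file, prompt_utt2data, the read_json_lists filtering) is removed, and a single new dpo flag is threaded from Dataset() into the final padding stage via functools.partial. A minimal usage sketch under assumptions: the two-stage data_pipeline below is hypothetical (real pipelines chain more processor stages between opening and padding), and it assumes the vendored package root is importable as cosyvoice, matching the imports in this diff.

from functools import partial

from cosyvoice.dataset.dataset import Dataset
from cosyvoice.dataset.processor import parquet_opener, padding

# Hypothetical two-stage pipeline; Dataset() re-binds the last stage with
# gan/dpo itself, so only use_spk_embedding is bound here.
data_pipeline = [parquet_opener, partial(padding, use_spk_embedding=False)]

train_set = Dataset('data.list',
                    data_pipeline=data_pipeline,
                    mode='train',
                    gan=False,
                    dpo=True,   # new flag: batches gain reject_speech_token fields
                    shuffle=True,
                    partition=True)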
xinference/thirdparty/cosyvoice/dataset/processor.py

@@ -43,8 +43,6 @@ def parquet_opener(data, mode='train', tts_data={}):
         for df in pq.ParquetFile(url).iter_batches(batch_size=64):
             df = df.to_pandas()
             for i in range(len(df)):
-                if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
-                    continue
                 sample.update(dict(df.loc[i]))
                 if mode == 'train':
                     # NOTE do not return sample directly, must initialize a new dict
@@ -100,6 +98,8 @@ def filter(data,
             continue
         if len(sample['speech_token']) == 0:
             continue
+        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+            continue
         if num_frames != 0:
             if len(sample['text_token']) / num_frames < min_output_input_ratio:
                 continue
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
 
 def compute_fbank(data,
                   feat_extractor,
+                  token_mel_ratio=0,
                   mode='train'):
     """ Extract fbank
 
@@ -174,8 +175,13 @@ def compute_fbank(data,
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
-        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        sample['speech_feat'] = feat
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        if token_mel_ratio != 0:
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            feat = feat[:token_mel_ratio * token_len]
+            sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_feat'] = feat
         yield sample
 
 
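The new token_mel_ratio branch enforces an exact frames-per-token alignment between speech_feat and speech_token: token_len is the smaller of feat_frames / token_mel_ratio and the current token count, the mel features are cut to token_mel_ratio * token_len frames, and the tokens to token_len. A standalone sketch of that arithmetic (the shapes are illustrative, not taken from the diff):

import torch

token_mel_ratio = 2                    # e.g. two mel frames per speech token
feat = torch.randn(101, 80)            # (frames, mel_bins) from the feat_extractor
speech_token = torch.zeros(49)         # token sequence slightly shorter than 101 / 2

# Same trimming rule as the new compute_fbank branch above.
token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))  # -> 49
feat = feat[:token_mel_ratio * token_len]                                     # -> 98 frames
speech_token = speech_token[:token_len]                                       # -> 49 tokens
assert feat.shape[0] == token_mel_ratio * speech_token.shape[0]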
@@ -236,8 +242,6 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     for sample in data:
         assert 'text' in sample
         sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
-        if mode == 'inference':
-            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
         yield sample
 
 
@@ -345,18 +349,15 @@ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
 def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
     """ Wrapper for static/dynamic batch
     """
-    if mode == 'inference':
-        return static_batch(data, 1)
+    if batch_type == 'static':
+        return static_batch(data, batch_size)
+    elif batch_type == 'dynamic':
+        return dynamic_batch(data, max_frames_in_batch)
     else:
-        if batch_type == 'static':
-            return static_batch(data, batch_size)
-        elif batch_type == 'dynamic':
-            return dynamic_batch(data, max_frames_in_batch)
-        else:
-            logging.fatal('Unsupported batch type {}'.format(batch_type))
+        logging.fatal('Unsupported batch type {}'.format(batch_type))
 
 
-def padding(data, use_spk_embedding, mode='train', gan=False):
+def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
     """ Padding the data into training data
 
     Args:
@@ -418,16 +419,14 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
             # only gan train needs speech, delete it to save memory
             del batch["speech"]
             del batch["speech_len"]
-        if mode == 'inference':
-            tts_text = [sample[i]['tts_text'] for i in order]
-            tts_index = torch.tensor([sample[i]['tts_index'] for i in order], dtype=torch.int32)
-            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
-            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
-            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
-            batch.update({'tts_text': tts_text,
-                          'tts_index': tts_index,
-                          'tts_text_token': tts_text_token,
-                          'tts_text_token_len': tts_text_token_len})
+        if dpo is True:
+            reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
+            reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
+            reject_speech_token = pad_sequence(reject_speech_token,
+                                               batch_first=True,
+                                               padding_value=0)
+            batch['reject_speech_token'] = reject_speech_token
+            batch['reject_speech_token_len'] = reject_speech_token_len
         if use_spk_embedding is True:
            batch["embedding"] = batch["spk_embedding"]
        else:
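The new dpo branch batches the per-sample rejected token sequences the same way the accepted ones are batched: right-padded with zeros, with the true lengths recorded alongside. A minimal sketch of just that step (the token values are made up):

import torch
from torch.nn.utils.rnn import pad_sequence

# Hypothetical per-sample rejected token sequences of unequal length.
reject_speech_token = [torch.tensor([5, 9, 3]), torch.tensor([7, 2])]
reject_speech_token_len = torch.tensor([t.size(0) for t in reject_speech_token], dtype=torch.int32)

padded = pad_sequence(reject_speech_token, batch_first=True, padding_value=0)
# padded  -> tensor([[5, 9, 3],
#                    [7, 2, 0]])
# lengths -> tensor([3, 2], dtype=torch.int32)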