PyPI - xinference - Versions diffs - 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl - Mend

xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (103) hide show

xinference/__init__.py +0 -1
xinference/_version.py +3 -3
xinference/api/restful_api.py +30 -5
xinference/client/restful/restful_client.py +18 -3
xinference/constants.py +0 -4
xinference/core/chat_interface.py +2 -2
xinference/core/image_interface.py +6 -3
xinference/core/model.py +9 -4
xinference/core/scheduler.py +4 -4
xinference/core/supervisor.py +2 -0
xinference/core/worker.py +7 -0
xinference/deploy/utils.py +6 -0
xinference/model/audio/core.py +9 -4
xinference/model/audio/cosyvoice.py +136 -0
xinference/model/audio/model_spec.json +24 -0
xinference/model/audio/model_spec_modelscope.json +27 -0
xinference/model/core.py +25 -4
xinference/model/embedding/core.py +88 -13
xinference/model/embedding/model_spec.json +8 -0
xinference/model/embedding/model_spec_modelscope.json +8 -0
xinference/model/flexible/core.py +8 -2
xinference/model/flexible/launchers/__init__.py +1 -0
xinference/model/flexible/launchers/image_process_launcher.py +70 -0
xinference/model/image/core.py +8 -5
xinference/model/image/model_spec.json +36 -5
xinference/model/image/model_spec_modelscope.json +21 -3
xinference/model/image/stable_diffusion/core.py +36 -28
xinference/model/llm/core.py +6 -4
xinference/model/llm/ggml/llamacpp.py +7 -5
xinference/model/llm/llm_family.json +802 -82
xinference/model/llm/llm_family.py +6 -6
xinference/model/llm/llm_family_csghub.json +39 -0
xinference/model/llm/llm_family_modelscope.json +295 -47
xinference/model/llm/mlx/core.py +7 -0
xinference/model/llm/pytorch/chatglm.py +246 -5
xinference/model/llm/pytorch/cogvlm2.py +1 -1
xinference/model/llm/pytorch/deepseek_vl.py +2 -1
xinference/model/llm/pytorch/falcon.py +2 -1
xinference/model/llm/pytorch/llama_2.py +4 -2
xinference/model/llm/pytorch/omnilmm.py +2 -1
xinference/model/llm/pytorch/qwen_vl.py +2 -1
xinference/model/llm/pytorch/vicuna.py +2 -1
xinference/model/llm/pytorch/yi_vl.py +2 -1
xinference/model/llm/sglang/core.py +12 -6
xinference/model/llm/utils.py +78 -1
xinference/model/llm/vllm/core.py +9 -5
xinference/model/rerank/core.py +4 -3
xinference/thirdparty/cosyvoice/__init__.py +0 -0
xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
xinference/thirdparty/cosyvoice/bin/train.py +136 -0
xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
xinference/thirdparty/cosyvoice/cli/model.py +60 -0
xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
xinference/thirdparty/cosyvoice/utils/common.py +103 -0
xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
xinference/web/ui/build/asset-manifest.json +3 -3
xinference/web/ui/build/index.html +1 -1
xinference/web/ui/build/static/js/{main.95c1d652.js → main.af906659.js} +3 -3
xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/METADATA +39 -11
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/RECORD +101 -57
xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
/xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/LICENSE +0 -0
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/WHEEL +0 -0
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/entry_points.txt +0 -0
{xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/top_level.txt +0 -0

xinference/thirdparty/cosyvoice/bin/inference.py ADDED Viewed

@@ -0,0 +1,114 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.dataset.dataset import Dataset
+def get_args():
+    """Parse and return the command-line options of the inference script.
+
+    The parsed namespace is echoed to stdout before it is returned.
+    """
+    parser = argparse.ArgumentParser(description='inference with your model')
+    # Mandatory options, registered in the order they are listed by --help.
+    required_options = (
+        ('--config', 'config file'),
+        ('--prompt_data', 'prompt data file'),
+        ('--prompt_utt2data', 'prompt data file'),
+        ('--tts_text', 'tts input file'),
+        ('--llm_model', 'llm model file'),
+        ('--flow_model', 'flow model file'),
+        ('--hifigan_model', 'hifigan model file'),
+    )
+    for flag, description in required_options:
+        parser.add_argument(flag, required=True, help=description)
+    parser.add_argument('--gpu', type=int, default=-1,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--mode', default='sft', choices=['sft', 'zero_shot'],
+                        help='inference mode')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    parsed = parser.parse_args()
+    print(parsed)
+    return parsed
+def main():
+    """Synthesize speech for every item of the test set and index the results.
+
+    Loads the three checkpoints named on the command line, runs inference once
+    per batch, saves each waveform as ``<result_dir>/<utt>_<tts_index>.wav``
+    and appends a ``<key> <path>`` line for it to ``<result_dir>/wav.scp``.
+    """
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+    # Init cosyvoice models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+    model.load(args.llm_model, args.flow_model, args.hifigan_model)
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    # The context manager closes wav.scp even when inference fails part-way,
+    # so the entries written so far are never lost to an unclosed handle.
+    with open(fn, 'w') as scp_file, torch.no_grad():
+        for _, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            tts_index = batch["tts_index"]
+            tts_text_token = batch["tts_text_token"].to(device)
+            tts_text_token_len = batch["tts_text_token_len"].to(device)
+            speech_token = batch["speech_token"].to(device)
+            speech_token_len = batch["speech_token_len"].to(device)
+            speech_feat = batch["speech_feat"].to(device)
+            speech_feat_len = batch["speech_feat_len"].to(device)
+            utt_embedding = batch["utt_embedding"].to(device)
+            spk_embedding = batch["spk_embedding"].to(device)
+            if args.mode == 'sft':
+                # sft mode conditions on the speaker embedding only.
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
+            else:
+                # zero_shot mode additionally passes the prompt text, speech tokens and features.
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
+                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
+            model_output = model.inference(**model_input)
+            tts_key = '{}_{}'.format(utts[0], tts_index[0])
+            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
+            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
+            scp_file.write('{} {}\n'.format(tts_key, tts_fn))
+            # Flush per utterance so the index on disk tracks the wav files already saved.
+            scp_file.flush()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+if __name__ == '__main__':
+    main()

xinference/thirdparty/cosyvoice/bin/train.py ADDED Viewed

@@ -0,0 +1,136 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import datetime
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from copy import deepcopy
+import torch
+import torch.distributed as dist
+import deepspeed
+from hyperpyyaml import load_hyperpyyaml
+from torch.distributed.elastic.multiprocessing.errors import record
+from cosyvoice.utils.executor import Executor
+from cosyvoice.utils.train_utils import (
+    init_distributed,
+    init_dataset_and_dataloader,
+    init_optimizer_and_scheduler,
+    init_summarywriter, save_model,
+    wrap_cuda_model, check_modify_and_save_config)
+def get_args():
+    """Build the training command line and return the parsed arguments.
+
+    The parser is passed through ``deepspeed.add_config_arguments`` before
+    the command line is parsed.
+    """
+    cli = argparse.ArgumentParser(description='training your network')
+    cli.add_argument('--train_engine', default='torch_ddp', choices=['torch_ddp', 'deepspeed'],
+                     help='Engine for paralleled training')
+    cli.add_argument('--model', required=True, help='model which will be trained')
+    cli.add_argument('--config', required=True, help='config file')
+    cli.add_argument('--train_data', required=True, help='train data file')
+    cli.add_argument('--cv_data', required=True, help='cv data file')
+    cli.add_argument('--checkpoint', help='checkpoint model')
+    cli.add_argument('--model_dir', required=True, help='save model dir')
+    cli.add_argument('--tensorboard_dir', default='tensorboard', help='tensorboard log dir')
+    # Dotted flag names are stored under an explicit dest.
+    cli.add_argument('--ddp.dist_backend', dest='dist_backend', default='nccl', choices=['nccl', 'gloo'],
+                     help='distributed backend')
+    cli.add_argument('--num_workers', default=0, type=int, help='num of subprocess workers for reading')
+    cli.add_argument('--prefetch', default=100, type=int, help='prefetch number')
+    cli.add_argument('--pin_memory', action='store_true', default=False,
+                     help='Use pinned memory buffers used for reading')
+    cli.add_argument('--deepspeed.save_states', dest='save_states', default='model_only',
+                     choices=['model_only', 'model+optimizer'],
+                     help='save model/optimizer states')
+    cli.add_argument('--timeout', default=30, type=int, help='timeout (in seconds) of cosyvoice_join.')
+    cli = deepspeed.add_config_arguments(cli)
+    return cli.parse_args()
+@record
+def main():
+    """Train the single sub-model selected by ``--model``.
+
+    Reads the YAML config, initialises the distributed environment, the
+    datasets/dataloaders, the optimizer and the scheduler, saves an ``init``
+    checkpoint and then runs ``max_epoch`` training epochs through ``Executor``.
+    """
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    # Override the config entries of the sub-models that are NOT being trained with None.
+    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
+    # Command-line values are merged into (and take precedence in) train_conf.
+    configs['train_conf'].update(vars(args))
+    # Init env for ddp
+    init_distributed(args)
+    # Get dataset & dataloader
+    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
+        init_dataset_and_dataloader(args, configs)
+    # Do some sanity checks and save config to args.model_dir
+    configs = check_modify_and_save_config(args, configs)
+    # Tensorboard summary
+    writer = init_summarywriter(args)
+    # load checkpoint
+    model = configs[args.model]
+    if args.checkpoint is not None:
+        # The state dict is loaded onto the CPU; the model is moved to GPU below.
+        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
+    # Dispatch model from cpu to gpu
+    model = wrap_cuda_model(args, model)
+    # Get optimizer & scheduler
+    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
+    # Save init checkpoints
+    info_dict = deepcopy(configs['train_conf'])
+    save_model(model, 'init', info_dict)
+    # Get executor
+    executor = Executor()
+    # Start training loop
+    for epoch in range(info_dict['max_epoch']):
+        executor.epoch = epoch
+        train_dataset.set_epoch(epoch)
+        # All ranks synchronise before the epoch starts.
+        dist.barrier()
+        # A fresh gloo group with the --timeout deadline is created for every epoch
+        # and destroyed again once the epoch has finished.
+        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
+        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+        dist.destroy_process_group(group_join)
+if __name__ == '__main__':
+    main()

xinference/thirdparty/cosyvoice/cli/__init__.py ADDED Viewed

File without changes

xinference/thirdparty/cosyvoice/cli/cosyvoice.py ADDED Viewed

@@ -0,0 +1,83 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from modelscope import snapshot_download
+from cosyvoice.cli.frontend import CosyVoiceFrontEnd
+from cosyvoice.cli.model import CosyVoiceModel
+class CosyVoice:
+    """Text-to-speech wrapper around one CosyVoice model directory.
+
+    The constructor reads ``cosyvoice.yaml`` from the model directory, builds
+    the front end and loads the ``llm.pt``, ``flow.pt`` and ``hift.pt``
+    checkpoints. Every ``inference_*`` method returns
+    ``{'tts_speech': tensor}`` where the tensor is the output of all text
+    segments concatenated along dim 1.
+    """
+
+    def __init__(self, model_dir):
+        """Load configuration, front end and model weights.
+
+        Args:
+            model_dir: Path of the model directory. When the path does not
+                exist it is handed to ``snapshot_download`` and the returned
+                directory is used instead. A name containing ``-Instruct``
+                enables instruct mode.
+        """
+        instruct = '-Instruct' in model_dir
+        # Keep the name as given by the caller; it is used in error messages.
+        self.model_dir = model_dir
+        if not os.path.exists(model_dir):
+            model_dir = snapshot_download(model_dir)
+        with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
+            configs = load_hyperpyyaml(f)
+        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+                                          configs['feat_extractor'],
+                                          '{}/campplus.onnx'.format(model_dir),
+                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
+                                          '{}/spk2info.pt'.format(model_dir),
+                                          instruct,
+                                          configs['allowed_special'])
+        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+        self.model.load('{}/llm.pt'.format(model_dir),
+                        '{}/flow.pt'.format(model_dir),
+                        '{}/hift.pt'.format(model_dir))
+        del configs
+
+    def _synthesize(self, tts_text, build_input):
+        """Run the model on every normalized segment of ``tts_text``.
+
+        Args:
+            tts_text: Text to synthesize.
+            build_input: Callable mapping one text segment to the keyword
+                arguments of ``self.model.inference``.
+
+        Returns:
+            ``{'tts_speech': tensor}`` with the per-segment outputs
+            concatenated along dim 1.
+        """
+        tts_speeches = []
+        for segment in self.frontend.text_normalize(tts_text, split=True):
+            model_output = self.model.inference(**build_input(segment))
+            tts_speeches.append(model_output['tts_speech'])
+        return {'tts_speech': torch.concat(tts_speeches, dim=1)}
+
+    def list_avaliable_spks(self):
+        """Return the speaker ids known to the front end as a list."""
+        return list(self.frontend.spk2info.keys())
+
+    def inference_sft(self, tts_text, spk_id):
+        """Synthesize ``tts_text`` with the stored speaker ``spk_id``."""
+        return self._synthesize(
+            tts_text, lambda segment: self.frontend.frontend_sft(segment, spk_id))
+
+    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+        """Synthesize ``tts_text`` conditioned on a prompt transcript and its audio."""
+        # The prompt is normalized once, as a whole, and reused for every segment.
+        prompt_text = self.frontend.text_normalize(prompt_text, split=False)
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_zero_shot(segment, prompt_text, prompt_speech_16k))
+
+    def inference_cross_lingual(self, tts_text, prompt_speech_16k):
+        """Synthesize ``tts_text`` conditioned on prompt audio only.
+
+        Raises:
+            ValueError: If the front end is in instruct mode.
+        """
+        if self.frontend.instruct is True:
+            raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_cross_lingual(segment, prompt_speech_16k))
+
+    def inference_instruct(self, tts_text, spk_id, instruct_text):
+        """Synthesize ``tts_text`` for ``spk_id`` following ``instruct_text``.
+
+        Raises:
+            ValueError: If the front end is not in instruct mode.
+        """
+        if self.frontend.instruct is False:
+            raise ValueError('{} do not support instruct inference'.format(self.model_dir))
+        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_instruct(segment, spk_id, instruct_text))

xinference/thirdparty/cosyvoice/cli/frontend.py ADDED Viewed

@@ -0,0 +1,168 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+import onnxruntime
+import torch
+import numpy as np
+import whisper
+from typing import Callable
+import torchaudio.compliance.kaldi as kaldi
+import torchaudio
+import os
+import re
+import inflect
+try:
+    import ttsfrd
+    use_ttsfrd = True
+except ImportError:
+    print("failed to import ttsfrd, use WeTextProcessing instead")
+    from tn.chinese.normalizer import Normalizer as ZhNormalizer
+    from tn.english.normalizer import Normalizer as EnNormalizer
+    use_ttsfrd = False
+from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
+class CosyVoiceFrontEnd:
+    """Converts raw text and prompt audio into the inputs of model inference.
+
+    The ``frontend_*`` methods each return a dict of tensors keyed by the
+    keyword-argument names the caller forwards to the model.
+    """
+
+    def __init__(self,
+                 get_tokenizer: Callable,
+                 feat_extractor: Callable,
+                 campplus_model: str,
+                 speech_tokenizer_model: str,
+                 spk2info: str = '',
+                 instruct: bool = False,
+                 allowed_special: str = 'all'):
+        """Create the tokenizer, the ONNX sessions and the text normalizers.
+
+        Args:
+            get_tokenizer: Zero-argument callable returning the text tokenizer.
+            feat_extractor: Callable applied to prompt speech to get its features.
+            campplus_model: Path of the ONNX model used for speaker embeddings.
+            speech_tokenizer_model: Path of the ONNX speech tokenizer model.
+            spk2info: Path of a ``torch.save``-d speaker table; skipped when
+                the path does not exist.
+            instruct: Whether the model is an instruct model.
+            allowed_special: Forwarded to ``tokenizer.encode``.
+
+        Raises:
+            AssertionError: If ttsfrd is installed but fails to initialize.
+        """
+        self.tokenizer = get_tokenizer()
+        self.feat_extractor = feat_extractor
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=["CUDAExecutionProvider" if torch.cuda.is_available() else "CPUExecutionProvider"])
+        # Always define the speaker table: without a spk2info file, listing speakers
+        # yields nothing and an unknown speaker raises KeyError instead of the
+        # attribute being missing altogether.
+        self.spk2info = {}
+        if os.path.exists(spk2info):
+            # NOTE(review): torch.load unpickles the file - only load speaker tables
+            # that come from a trusted model directory.
+            self.spk2info = torch.load(spk2info, map_location=self.device)
+        self.instruct = instruct
+        self.allowed_special = allowed_special
+        self.inflect_parser = inflect.engine()
+        self.use_ttsfrd = use_ttsfrd
+        if self.use_ttsfrd:
+            self.frd = ttsfrd.TtsFrontendEngine()
+            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+            # initialize() must run unconditionally; inside an ``assert`` it would be
+            # skipped entirely when Python runs with -O.
+            initialized = self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR))
+            if initialized is not True:
+                raise AssertionError('failed to initialize ttsfrd resource')
+            self.frd.set_lang_type('pinyin')
+            self.frd.enable_pinyin_mix(True)
+            self.frd.set_breakmodel_index(1)
+        else:
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
+            self.en_tn_model = EnNormalizer()
+
+    def _extract_text_token(self, text):
+        """Tokenize ``text``; return an int32 ``(1, n)`` token tensor and its length tensor."""
+        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
+        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
+        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
+        return text_token, text_token_len
+
+    def _extract_speech_token(self, speech):
+        """Run the ONNX speech tokenizer on the 128-bin log-mel spectrogram of ``speech``.
+
+        Returns an int32 ``(1, n)`` token tensor and its length tensor.
+        """
+        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
+        session_inputs = self.speech_tokenizer_session.get_inputs()
+        # First session input: the features; second: their length along dim 2.
+        speech_token = self.speech_tokenizer_session.run(None, {session_inputs[0].name: feat.detach().cpu().numpy(),
+                                                                session_inputs[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
+        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
+        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_token, speech_token_len
+
+    def _extract_spk_embedding(self, speech):
+        """Return the campplus embedding of ``speech`` as a tensor with a leading dim of 1."""
+        feat = kaldi.fbank(speech,
+                           num_mel_bins=80,
+                           dither=0,
+                           sample_frequency=16000)
+        # Subtract the mean over dim 0 before feeding the ONNX model.
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
+        embedding = torch.tensor([embedding]).to(self.device)
+        return embedding
+
+    def _extract_speech_feat(self, speech):
+        """Apply ``feat_extractor`` to ``speech``; return the features and their length (dim 1)."""
+        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
+        speech_feat = speech_feat.unsqueeze(dim=0)
+        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_feat, speech_feat_len
+
+    def text_normalize(self, text, split=True):
+        """Normalize ``text`` and optionally split it into segments.
+
+        Args:
+            text: Input text; surrounding whitespace is stripped.
+            split: When ``False`` the normalized string is returned as is.
+
+        Returns:
+            The normalized string when ``split`` is ``False``, otherwise the
+            list of segments produced by ``split_paragraph``.
+        """
+        text = text.strip()
+        is_chinese = contains_chinese(text)
+        if self.use_ttsfrd:
+            text = self.frd.get_frd_extra_info(text, 'input')
+        elif is_chinese:
+            text = self.zh_tn_model.normalize(text)
+        else:
+            text = self.en_tn_model.normalize(text)
+        if is_chinese:
+            text = text.replace("\n", "")
+            text = replace_blank(text)
+            text = replace_corner_mark(text)
+            text = text.replace(".", "、")
+            text = text.replace(" - ", "，")
+            text = remove_bracket(text)
+            # A trailing run of commas becomes a full stop.
+            text = re.sub(r'[，,]+$', '。', text)
+        else:
+            text = spell_out_number(text, self.inflect_parser)
+        if split is False:
+            # The segments are only needed when splitting was requested, so
+            # skip tokenizing/splitting altogether here.
+            return text
+        return list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special),
+                                    "zh" if is_chinese else "en", token_max_n=80,
+                                    token_min_n=60, merge_len=20,
+                                    comma_split=False))
+
+    def frontend_sft(self, tts_text, spk_id):
+        """Build model inputs from text and the stored embedding of ``spk_id``.
+
+        Raises:
+            KeyError: If ``spk_id`` is not in the speaker table.
+        """
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        embedding = self.spk2info[spk_id]['embedding']
+        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
+        return model_input
+
+    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+        """Build model inputs from text plus a prompt transcript and its 16 kHz audio."""
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+        # Speech features are extracted from a 22050 Hz resample of the prompt;
+        # speech tokens and the speaker embedding use the 16 kHz audio directly.
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
+                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                       'llm_embedding': embedding, 'flow_embedding': embedding}
+        return model_input
+
+    def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
+        """Build zero-shot inputs with an empty prompt text, minus the llm prompt entries."""
+        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
+        # in cross lingual mode, we remove prompt in llm
+        del model_input['prompt_text']
+        del model_input['prompt_text_len']
+        del model_input['llm_prompt_speech_token']
+        del model_input['llm_prompt_speech_token_len']
+        return model_input
+
+    def frontend_instruct(self, tts_text, spk_id, instruct_text):
+        """Build sft inputs where the instruction text is passed as the prompt text."""
+        model_input = self.frontend_sft(tts_text, spk_id)
+        # in instruct mode, we remove spk_embedding in llm due to information leakage
+        del model_input['llm_embedding']
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
+        model_input['prompt_text'] = instruct_text_token
+        model_input['prompt_text_len'] = instruct_text_token_len
+        return model_input

xinference/thirdparty/cosyvoice/cli/model.py ADDED Viewed

@@ -0,0 +1,60 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+class CosyVoiceModel:
+    """Chains the three sub-models: llm -> flow -> hift.
+
+    The device is ``cuda`` when available, otherwise ``cpu``.
+    """
+
+    def __init__(self,
+                 llm: torch.nn.Module,
+                 flow: torch.nn.Module,
+                 hift: torch.nn.Module):
+        """Store the three modules; no weights are loaded here."""
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.llm = llm
+        self.flow = flow
+        self.hift = hift
+
+    def load(self, llm_model, flow_model, hift_model):
+        """Load each checkpoint into its module, move it to the device and set eval mode."""
+        stages = ((self.llm, llm_model), (self.flow, flow_model), (self.hift, hift_model))
+        for module, checkpoint in stages:
+            state_dict = torch.load(checkpoint, map_location=self.device)
+            module.load_state_dict(state_dict)
+            module.to(self.device).eval()
+
+    def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
+                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
+        """Generate speech for ``text``.
+
+        All tensor arguments are moved to ``self.device`` before use; the
+        optional prompt arguments default to empty tensors.
+
+        Returns:
+            ``{'tts_speech': tensor}`` with the hift output moved to the CPU.
+        """
+        target = self.device
+        # Stage 1: text (plus optional prompt) -> speech tokens.
+        llm_inputs = dict(text=text.to(target),
+                          text_len=text_len.to(target),
+                          prompt_text=prompt_text.to(target),
+                          prompt_text_len=prompt_text_len.to(target),
+                          prompt_speech_token=llm_prompt_speech_token.to(target),
+                          prompt_speech_token_len=llm_prompt_speech_token_len.to(target),
+                          embedding=llm_embedding.to(target),
+                          beam_size=1,
+                          sampling=25,
+                          max_token_text_ratio=30,
+                          min_token_text_ratio=3)
+        tts_speech_token = self.llm.inference(**llm_inputs)
+        # Stage 2: speech tokens -> mel; the token length is taken from dim 1.
+        generated_len = torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(target)
+        flow_inputs = dict(token=tts_speech_token,
+                           token_len=generated_len,
+                           prompt_token=flow_prompt_speech_token.to(target),
+                           prompt_token_len=flow_prompt_speech_token_len.to(target),
+                           prompt_feat=prompt_speech_feat.to(target),
+                           prompt_feat_len=prompt_speech_feat_len.to(target),
+                           embedding=flow_embedding.to(target))
+        tts_mel = self.flow.inference(**flow_inputs)
+        # Stage 3: mel -> waveform, returned on the CPU.
+        waveform = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
+        return {'tts_speech': waveform}

xinference/thirdparty/cosyvoice/dataset/__init__.py ADDED Viewed

File without changes

xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl

Potentially problematic release.

xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl