xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/model.py +3 -4
- xinference/core/supervisor.py +29 -1
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +64 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +19 -4
- xinference/model/embedding/vllm/core.py +40 -8
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +116 -9
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +42 -40
- xinference/model/llm/llm_family.json +435 -23
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +6 -12
- xinference/model/llm/utils.py +128 -46
- xinference/model/llm/vllm/core.py +8 -61
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/bin/export_onnx.py

@@ -62,135 +62,58 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-
-            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+            model = CosyVoice2(args.model_dir)
         except Exception:
             raise TypeError('no valid model_type!')
 
-    [old lines 70-117 were also removed; their content is not captured in this diff view]
-        logging.info('successfully export estimator')
-    else:
-        # 1. export flow decoder estimator
-        estimator = model.model.flow.decoder.estimator
-        estimator.forward = estimator.forward_chunk
-        estimator.eval()
-
-        device = model.model.device
-        batch_size, seq_len = 2, 256
-        out_channels = model.model.flow.decoder.estimator.out_channels
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-        cache = model.model.init_flow_cache()['decoder_cache']
-        cache.pop('offset')
-        cache = {k: v[0] for k, v in cache.items()}
-        torch.onnx.export(
-            estimator,
-            (x, mask, mu, t, spks, cond,
-             cache['down_blocks_conv_cache'],
-             cache['down_blocks_kv_cache'],
-             cache['mid_blocks_conv_cache'],
-             cache['mid_blocks_kv_cache'],
-             cache['up_blocks_conv_cache'],
-             cache['up_blocks_kv_cache'],
-             cache['final_blocks_conv_cache']),
-            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-            export_params=True,
-            opset_version=18,
-            do_constant_folding=True,
-            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
-                         'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
-            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out',
-                          'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
-            dynamic_axes={
-                'x': {2: 'seq_len'},
-                'mask': {2: 'seq_len'},
-                'mu': {2: 'seq_len'},
-                'cond': {2: 'seq_len'},
-                'down_blocks_kv_cache': {3: 'cache_in_len'},
-                'mid_blocks_kv_cache': {3: 'cache_in_len'},
-                'up_blocks_kv_cache': {3: 'cache_in_len'},
-                'estimator_out': {2: 'seq_len'},
-                'down_blocks_kv_cache_out': {3: 'cache_out_len'},
-                'mid_blocks_kv_cache_out': {3: 'cache_out_len'},
-                'up_blocks_kv_cache_out': {3: 'cache_out_len'},
-            }
-        )
-
-        # 2. test computation consistency
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        option.intra_op_num_threads = 1
-        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                      sess_options=option, providers=providers)
-
-        for iter in tqdm(range(10)):
-            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-            cache = model.model.init_flow_cache()['decoder_cache']
-            cache.pop('offset')
-            cache = {k: v[0] for k, v in cache.items()}
-            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
-            ort_inputs = {
-                'x': x.cpu().numpy(),
-                'mask': mask.cpu().numpy(),
-                'mu': mu.cpu().numpy(),
-                't': t.cpu().numpy(),
-                'spks': spks.cpu().numpy(),
-                'cond': cond.cpu().numpy(),
-            }
-            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
-            if iter == 0:
-                # NOTE why can not pass first iteration check?
-                continue
-            for i, j in zip(output_pytorch, output_onnx):
-                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
-        logging.info('successfully export estimator')
+    # 1. export flow decoder estimator
+    estimator = model.model.flow.decoder.estimator
+    estimator.eval()
+
+    device = model.model.device
+    batch_size, seq_len = 2, 256
+    out_channels = model.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {2: 'seq_len'},
+            'mask': {2: 'seq_len'},
+            'mu': {2: 'seq_len'},
+            'cond': {2: 'seq_len'},
+            'estimator_out': {2: 'seq_len'},
+        }
+    )
+
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+    logging.info('successfully export estimator')
 
 
 if __name__ == "__main__":
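The rewritten export no longer feeds streaming-cache tensors to the estimator, so the exported graph exposes only the six plain inputs. A minimal sketch, assuming `model_dir` points at a directory already containing the file produced by export_onnx.py, for checking the new input signature with onnxruntime:

    import onnxruntime

    model_dir = 'pretrained_models/CosyVoice2-0.5B'   # placeholder path
    sess = onnxruntime.InferenceSession(
        '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
        providers=['CPUExecutionProvider'])
    # with the 1.10.0 export this should list ['x', 'mask', 'mu', 't', 'spks', 'cond'],
    # i.e. no *_cache inputs any more
    print([inp.name for inp in sess.get_inputs()])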
xinference/thirdparty/cosyvoice/bin/train.py

@@ -27,6 +27,7 @@ from hyperpyyaml import load_hyperpyyaml
 
 from torch.distributed.elastic.multiprocessing.errors import record
 
+from cosyvoice.utils.losses import DPOLoss
 from cosyvoice.utils.executor import Executor
 from cosyvoice.utils.train_utils import (
     init_distributed,
@@ -43,6 +44,7 @@ def get_args():
                         choices=['torch_ddp', 'deepspeed'],
                         help='Engine for paralleled training')
     parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--ref_model', required=False, help='ref model used in dpo')
     parser.add_argument('--config', required=True, help='config file')
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
@@ -73,6 +75,10 @@ def get_args():
                         action='store_true',
                         default=False,
                         help='Use automatic mixed precision training')
+    parser.add_argument('--dpo',
+                        action='store_true',
+                        default=False,
+                        help='Use Direct Preference Optimization')
     parser.add_argument('--deepspeed.save_states',
                         dest='save_states',
                         default='model_only',
@@ -113,7 +119,7 @@ def main():
 
     # Get dataset & dataloader
     train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
-        init_dataset_and_dataloader(args, configs, gan)
+        init_dataset_and_dataloader(args, configs, gan, args.dpo)
 
     # Do some sanity checks and save config to arsg.model_dir
     configs = check_modify_and_save_config(args, configs)
@@ -122,6 +128,8 @@ def main():
     writer = init_summarywriter(args)
 
     # load checkpoint
+    if args.dpo is True:
+        configs[args.model].forward = configs[args.model].forward_dpo
     model = configs[args.model]
     start_step, start_epoch = 0, -1
     if args.checkpoint is not None:
@@ -150,13 +158,25 @@ def main():
         info_dict['epoch'] = start_epoch
         save_model(model, 'init', info_dict)
 
+    # DPO related
+    if args.dpo is True:
+        ref_model = deepcopy(configs[args.model])
+        state_dict = torch.load(args.ref_model, map_location='cpu')
+        ref_model.load_state_dict(state_dict, strict=False)
+        dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)
+        # NOTE maybe it is not needed to wrap ref_model as ddp because its parameter is not updated
+        ref_model = wrap_cuda_model(args, ref_model)
+    else:
+        ref_model, dpo_loss = None, None
+
     # Get executor
-    executor = Executor(gan=gan)
+    executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)
     executor.step = start_step
 
     # Init scaler, used for pytorch amp mixed precision training
     scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
     print('start step {} start epoch {}'.format(start_step, start_epoch))
+
     # Start training loop
     for epoch in range(start_epoch + 1, info_dict['max_epoch']):
         executor.epoch = epoch
@@ -167,7 +187,7 @@ def main():
             executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
                                         writer, info_dict, scaler, group_join)
         else:
-            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
+            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=ref_model)
         dist.destroy_process_group(group_join)
 
 
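Taken together, these hunks wire Direct Preference Optimization into train.py: training launched with `--dpo --ref_model <checkpoint>` swaps the model's forward for `forward_dpo`, builds a frozen reference copy, and hands a `DPOLoss` to the executor. A minimal sketch consolidating that flow (the function name and `ref_model_ckpt_path` are placeholders; only the constructor arguments shown in the diff are assumed):

    import torch
    from copy import deepcopy
    from cosyvoice.utils.losses import DPOLoss
    from cosyvoice.utils.executor import Executor

    def build_dpo_executor(model, ref_model_ckpt_path, gan=False):
        """Hypothetical helper mirroring the new --dpo branch in bin/train.py."""
        ref_model = deepcopy(model)                                    # frozen reference copy of the policy model
        state_dict = torch.load(ref_model_ckpt_path, map_location='cpu')
        ref_model.load_state_dict(state_dict, strict=False)
        dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)  # defaults used by train.py
        return Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)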
xinference/thirdparty/cosyvoice/cli/cosyvoice.py

@@ -26,7 +26,7 @@ from cosyvoice.utils.class_utils import get_model_type
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -59,6 +59,7 @@ class CosyVoice:
         if load_trt:
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
+                                trt_concurrent,
                                 self.fp16)
         del configs
 
@@ -140,7 +141,7 @@ class CosyVoice:
 
 class CosyVoice2(CosyVoice):
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False,
+    def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -162,15 +163,18 @@ class CosyVoice2(CosyVoice):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
-        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16
+        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
-                        '{}/flow.pt'.format(model_dir)
+                        '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
+        if load_vllm:
+            self.model.load_vllm('{}/vllm'.format(model_dir))
         if load_jit:
             self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         if load_trt:
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
+                                trt_concurrent,
                                 self.fp16)
         del configs
 
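The CosyVoice and CosyVoice2 constructors gain a `trt_concurrent` argument, and CosyVoice2 additionally gains `load_vllm`; the old `use_flow_cache` keyword is gone. A minimal usage sketch, assuming a local model directory (the path below is a placeholder, and the comment on `trt_concurrent` is an inference from how it is passed to `TrtContextWrapper`):

    from cosyvoice.cli.cosyvoice import CosyVoice2

    # trt_concurrent appears to control how many TensorRT execution contexts are
    # created for the flow decoder estimator; load_vllm routes the LLM through vLLM
    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B',
                           load_jit=False, load_trt=True, load_vllm=False,
                           fp16=True, trt_concurrent=2)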
xinference/thirdparty/cosyvoice/cli/frontend.py

@@ -28,9 +28,9 @@ try:
     import ttsfrd
     use_ttsfrd = True
 except ImportError:
-    print("failed to import ttsfrd, use
-    from
-    from
+    print("failed to import ttsfrd, use wetext instead")
+    from wetext import Normalizer as ZhNormalizer
+    from wetext import Normalizer as EnNormalizer
     use_ttsfrd = False
 from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
@@ -68,7 +68,7 @@ class CosyVoiceFrontEnd:
                 'failed to initialize ttsfrd resource'
             self.frd.set_lang_type('pinyinvg')
         else:
-            self.zh_tn_model = ZhNormalizer(remove_erhua=False
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False)
             self.en_tn_model = EnNormalizer()
             self.inflect_parser = inflect.engine()
 
xinference/thirdparty/cosyvoice/cli/model.py

@@ -1,4 +1,5 @@
 # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +22,8 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
-from cosyvoice.utils.file_utils import convert_onnx_to_trt
+from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
+from cosyvoice.utils.common import TrtContextWrapper
 
 
 class CosyVoiceModel:
@@ -80,30 +82,28 @@ class CosyVoiceModel:
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
+    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, trt_concurrent, fp16):
         assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
-        if not os.path.exists(flow_decoder_estimator_model):
+        if not os.path.exists(flow_decoder_estimator_model) or os.path.getsize(flow_decoder_estimator_model) == 0:
             convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
-        if os.path.getsize(flow_decoder_estimator_model) == 0:
-            raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
         del self.flow.decoder.estimator
         import tensorrt as trt
         with open(flow_decoder_estimator_model, 'rb') as f:
-
-            assert
-            self.flow.decoder.estimator = self.
+            estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+        assert estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
+        self.flow.decoder.estimator = TrtContextWrapper(estimator_engine, trt_concurrent=trt_concurrent, device=self.device)
 
     def get_trt_kwargs(self):
         min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
-        opt_shape = [(2, 80,
+        opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)]
         max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
         input_names = ["x", "mask", "mu", "cond"]
         return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
 
     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        with self.llm_context, torch.cuda.amp.autocast(self.fp16):
+        with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False):
             if isinstance(text, Generator):
-                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
+                assert isinstance(self, CosyVoice2Model) and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2 and do not support vllm!'
                 for i in self.llm.inference_bistream(text=text,
                                                      prompt_text=prompt_text.to(self.device),
                                                      prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
@@ -118,7 +118,8 @@ class CosyVoiceModel:
                                          prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
                                          prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                          prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                         embedding=llm_embedding.to(self.device)
+                                         embedding=llm_embedding.to(self.device),
+                                         uuid=uuid):
                 self.tts_speech_token_dict[uuid].append(i)
             self.llm_end_dict[uuid] = True
 
@@ -231,7 +232,9 @@ class CosyVoiceModel:
             self.mel_overlap_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
             self.flow_cache_dict.pop(this_uuid)
-        torch.cuda.
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()
 
 
 class CosyVoice2Model(CosyVoiceModel):
@@ -240,20 +243,17 @@ class CosyVoice2Model(CosyVoiceModel):
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
-                 fp16: bool = False
-                 use_flow_cache: bool = False):
+                 fp16: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
         self.fp16 = fp16
-        self.use_flow_cache = use_flow_cache
         if self.fp16 is True:
             self.llm.half()
             self.flow.half()
-        #
+        # NOTE must matching training static_chunk_size
         self.token_hop_len = 25
-        self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
         # hift cache
         self.mel_cache_len = 8
         self.source_cache_len = int(self.mel_cache_len * 480)
@@ -265,55 +265,35 @@ class CosyVoice2Model(CosyVoiceModel):
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
         self.llm_end_dict = {}
-        self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def init_flow_cache(self):
-        encoder_cache = {'offset': 0,
-                         'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
-                         'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
-                         'upsample_offset': 0,
-                         'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
-                         'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
-        decoder_cache = {'offset': 0,
-                         'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
-                         'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
-                         'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
-                         'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
-        if self.fp16 is True:
-            for cache in [encoder_cache, decoder_cache]:
-                for k, v in cache.items():
-                    if isinstance(v, torch.Tensor):
-                        cache[k] = v.half()
-        cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
-        return cache
-
     def load_jit(self, flow_encoder_model):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def
-    [old lines 299-304 were also removed; their content is not captured in this diff view]
+    def load_vllm(self, model_dir):
+        export_cosyvoice2_vllm(self.llm, model_dir, self.device)
+        from vllm import EngineArgs, LLMEngine
+        engine_args = EngineArgs(model=model_dir,
+                                 skip_tokenizer_init=True,
+                                 enable_prompt_embeds=True,
+                                 gpu_memory_utilization=0.2)
+        self.llm.vllm = LLMEngine.from_engine_args(engine_args)
+        self.llm.lock = threading.Lock()
+        del self.llm.llm.model.model.layers
 
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel,
-            [old lines 309-316 were also removed; their content is not captured in this diff view]
+            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+                                             token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_token=prompt_token.to(self.device),
+                                             prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_feat=prompt_feat.to(self.device),
+                                             prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                             embedding=embedding.to(self.device),
+                                             streaming=stream,
+                                             finalize=finalize)
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
         # append hift cache
         if self.hift_cache_dict[uuid] is not None:
             hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
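As the hunk above shows, `load_vllm` exports the CosyVoice2 LLM to a vLLM-compatible directory, builds an `LLMEngine` with prompt embeddings enabled, and then deletes the now-redundant torch decoder layers. A hedged sketch of calling it directly, mirroring what `CosyVoice2(model_dir, load_vllm=True)` does internally (the path is a placeholder and the `vllm` subdirectory name comes from the constructor shown earlier):

    import os
    from cosyvoice.cli.cosyvoice import CosyVoice2

    model_dir = 'pretrained_models/CosyVoice2-0.5B'        # placeholder path
    cosyvoice = CosyVoice2(model_dir)
    cosyvoice.model.load_vllm(os.path.join(model_dir, 'vllm'))
    # afterwards cosyvoice.model.llm.vllm is a vllm.LLMEngine, and the original
    # torch decoder layers have been dropped to free GPU memory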
@@ -348,34 +328,30 @@ class CosyVoice2Model(CosyVoiceModel):
         with self.lock:
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
-            self.flow_cache_dict[this_uuid] = self.init_flow_cache()
         if source_speech_token.shape[1] == 0:
             p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
         else:
             p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
-
-
-            flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
-            prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
+            token_offset = 0
+            prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
             while True:
                 time.sleep(0.1)
-
-
+                this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
+                if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
                                                      embedding=flow_embedding,
+                                                     token_offset=token_offset,
                                                      uuid=this_uuid,
+                                                     stream=stream,
                                                      finalize=False)
-
-                    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
-                    prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
+                    token_offset += this_token_hop_len
                     yield {'tts_speech': this_tts_speech.cpu()}
-
-                    self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
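The streaming path now tracks an absolute `token_offset` instead of trimming `tts_speech_token_dict`, and pads the first hop so that the prompt tokens plus the first chunk align with `token_hop_len`. A small worked example of the new scheduling arithmetic (the prompt length of 70 tokens is arbitrary):

    import numpy as np

    token_hop_len = 25      # fixed in CosyVoice2Model.__init__
    prompt_len = 70         # example flow_prompt_speech_token length
    prompt_token_pad = int(np.ceil(prompt_len / token_hop_len) * token_hop_len - prompt_len)  # -> 5
    first_hop = token_hop_len + prompt_token_pad                                              # -> 30
    # the first chunk waits for first_hop + pre_lookahead_len tokens; later chunks use
    # token_hop_len, and token_offset advances by this_token_hop_len after each chunk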
@@ -384,18 +360,19 @@ class CosyVoice2Model(CosyVoiceModel):
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=token_offset,
                                              uuid=this_uuid,
                                              finalize=True)
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
             p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=0,
                                              uuid=this_uuid,
                                              finalize=True,
                                              speed=speed)
@@ -404,5 +381,6 @@ class CosyVoice2Model(CosyVoiceModel):
             self.tts_speech_token_dict.pop(this_uuid)
             self.llm_end_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
-
-
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()