xinference-1.9.0-py3-none-any.whl → xinference-1.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/bin/export_onnx.py

@@ -62,135 +62,58 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            # NOTE set use_flow_cache=True when export jit for cache inference
-            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+            model = CosyVoice2(args.model_dir)
         except Exception:
             raise TypeError('no valid model_type!')

-    if not isinstance(model, CosyVoice2):
-        # 1. export flow decoder estimator
-        estimator = model.model.flow.decoder.estimator
-        estimator.eval()
-
-        device = model.model.device
-        batch_size, seq_len = 2, 256
-        out_channels = model.model.flow.decoder.estimator.out_channels
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-        torch.onnx.export(
-            estimator,
-            (x, mask, mu, t, spks, cond),
-            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-            export_params=True,
-            opset_version=18,
-            do_constant_folding=True,
-            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
-            output_names=['estimator_out'],
-            dynamic_axes={
-                'x': {2: 'seq_len'},
-                'mask': {2: 'seq_len'},
-                'mu': {2: 'seq_len'},
-                'cond': {2: 'seq_len'},
-                'estimator_out': {2: 'seq_len'},
-            }
-        )
-
-        # 2. test computation consistency
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        option.intra_op_num_threads = 1
-        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                      sess_options=option, providers=providers)
-
-        for _ in tqdm(range(10)):
-            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-            output_pytorch = estimator(x, mask, mu, t, spks, cond)
-            ort_inputs = {
-                'x': x.cpu().numpy(),
-                'mask': mask.cpu().numpy(),
-                'mu': mu.cpu().numpy(),
-                't': t.cpu().numpy(),
-                'spks': spks.cpu().numpy(),
-                'cond': cond.cpu().numpy()
-            }
-            output_onnx = estimator_onnx.run(None, ort_inputs)[0]
-            torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
-        logging.info('successfully export estimator')
-    else:
-        # 1. export flow decoder estimator
-        estimator = model.model.flow.decoder.estimator
-        estimator.forward = estimator.forward_chunk
-        estimator.eval()
-
-        device = model.model.device
-        batch_size, seq_len = 2, 256
-        out_channels = model.model.flow.decoder.estimator.out_channels
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-        cache = model.model.init_flow_cache()['decoder_cache']
-        cache.pop('offset')
-        cache = {k: v[0] for k, v in cache.items()}
-        torch.onnx.export(
-            estimator,
-            (x, mask, mu, t, spks, cond,
-             cache['down_blocks_conv_cache'],
-             cache['down_blocks_kv_cache'],
-             cache['mid_blocks_conv_cache'],
-             cache['mid_blocks_kv_cache'],
-             cache['up_blocks_conv_cache'],
-             cache['up_blocks_kv_cache'],
-             cache['final_blocks_conv_cache']),
-            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-            export_params=True,
-            opset_version=18,
-            do_constant_folding=True,
-            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
-                         'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
-            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out',
-                          'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
-            dynamic_axes={
-                'x': {2: 'seq_len'},
-                'mask': {2: 'seq_len'},
-                'mu': {2: 'seq_len'},
-                'cond': {2: 'seq_len'},
-                'down_blocks_kv_cache': {3: 'cache_in_len'},
-                'mid_blocks_kv_cache': {3: 'cache_in_len'},
-                'up_blocks_kv_cache': {3: 'cache_in_len'},
-                'estimator_out': {2: 'seq_len'},
-                'down_blocks_kv_cache_out': {3: 'cache_out_len'},
-                'mid_blocks_kv_cache_out': {3: 'cache_out_len'},
-                'up_blocks_kv_cache_out': {3: 'cache_out_len'},
-            }
-        )
-
-        # 2. test computation consistency
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        option.intra_op_num_threads = 1
-        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                      sess_options=option, providers=providers)
-
-        for iter in tqdm(range(10)):
-            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-            cache = model.model.init_flow_cache()['decoder_cache']
-            cache.pop('offset')
-            cache = {k: v[0] for k, v in cache.items()}
-            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
-            ort_inputs = {
-                'x': x.cpu().numpy(),
-                'mask': mask.cpu().numpy(),
-                'mu': mu.cpu().numpy(),
-                't': t.cpu().numpy(),
-                'spks': spks.cpu().numpy(),
-                'cond': cond.cpu().numpy(),
-            }
-            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
-            if iter == 0:
-                # NOTE why can not pass first iteration check?
-                continue
-            for i, j in zip(output_pytorch, output_onnx):
-                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
-        logging.info('successfully export estimator')
+    # 1. export flow decoder estimator
+    estimator = model.model.flow.decoder.estimator
+    estimator.eval()
+
+    device = model.model.device
+    batch_size, seq_len = 2, 256
+    out_channels = model.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {2: 'seq_len'},
+            'mask': {2: 'seq_len'},
+            'mu': {2: 'seq_len'},
+            'cond': {2: 'seq_len'},
+            'estimator_out': {2: 'seq_len'},
+        }
+    )
+
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+    logging.info('successfully export estimator')


 if __name__ == "__main__":
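
Editor's note: the simplified hunk keeps only the stateless export path, i.e. dump the flow-decoder estimator with torch.onnx.export using a dynamic seq_len axis, then replay random-length dummy inputs through both the PyTorch module and an onnxruntime session and compare outputs. Below is a minimal, self-contained sketch of that export-and-verify pattern, using a toy Conv1d in place of the CosyVoice estimator; the names, shapes, and file name are illustrative and not taken from the package.

    import numpy as np
    import onnxruntime
    import torch

    # Toy stand-in for the flow decoder estimator: any module whose last axis is a sequence length.
    model = torch.nn.Conv1d(80, 80, kernel_size=3, padding=1).eval()
    x = torch.randn(2, 80, 256)

    torch.onnx.export(
        model, (x,), 'estimator_toy.onnx',
        opset_version=18,
        input_names=['x'], output_names=['y'],
        dynamic_axes={'x': {2: 'seq_len'}, 'y': {2: 'seq_len'}},  # variable sequence length
    )

    sess = onnxruntime.InferenceSession('estimator_toy.onnx', providers=['CPUExecutionProvider'])
    for _ in range(5):
        x = torch.randn(2, 80, int(torch.randint(16, 512, (1,))))
        ref = model(x).detach().numpy()
        out = sess.run(None, {'x': x.numpy()})[0]
        np.testing.assert_allclose(ref, out, rtol=1e-2, atol=1e-4)  # same tolerances as the hunk above
    print('toy export verified')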
xinference/thirdparty/cosyvoice/bin/inference.py → inference_deprecated.py

@@ -122,4 +122,5 @@ def main():
 
 
 if __name__ == '__main__':
+    logging.warning('this code has been deprecated, please refer to README for CosyVoice inference usage!')
     main()
xinference/thirdparty/cosyvoice/bin/train.py

@@ -27,6 +27,7 @@ from hyperpyyaml import load_hyperpyyaml
 
 from torch.distributed.elastic.multiprocessing.errors import record
 
+from cosyvoice.utils.losses import DPOLoss
 from cosyvoice.utils.executor import Executor
 from cosyvoice.utils.train_utils import (
     init_distributed,
@@ -43,6 +44,7 @@ def get_args():
                         choices=['torch_ddp', 'deepspeed'],
                         help='Engine for paralleled training')
     parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--ref_model', required=False, help='ref model used in dpo')
     parser.add_argument('--config', required=True, help='config file')
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
@@ -73,6 +75,10 @@ def get_args():
                         action='store_true',
                         default=False,
                         help='Use automatic mixed precision training')
+    parser.add_argument('--dpo',
+                        action='store_true',
+                        default=False,
+                        help='Use Direct Preference Optimization')
     parser.add_argument('--deepspeed.save_states',
                         dest='save_states',
                         default='model_only',
@@ -113,7 +119,7 @@ def main():
 
     # Get dataset & dataloader
     train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
-        init_dataset_and_dataloader(args, configs, gan)
+        init_dataset_and_dataloader(args, configs, gan, args.dpo)
 
     # Do some sanity checks and save config to arsg.model_dir
     configs = check_modify_and_save_config(args, configs)
@@ -122,6 +128,8 @@ def main():
     writer = init_summarywriter(args)
 
     # load checkpoint
+    if args.dpo is True:
+        configs[args.model].forward = configs[args.model].forward_dpo
     model = configs[args.model]
     start_step, start_epoch = 0, -1
     if args.checkpoint is not None:
@@ -150,13 +158,25 @@ def main():
     info_dict['epoch'] = start_epoch
     save_model(model, 'init', info_dict)
 
+    # DPO related
+    if args.dpo is True:
+        ref_model = deepcopy(configs[args.model])
+        state_dict = torch.load(args.ref_model, map_location='cpu')
+        ref_model.load_state_dict(state_dict, strict=False)
+        dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)
+        # NOTE maybe it is not needed to wrap ref_model as ddp because its parameter is not updated
+        ref_model = wrap_cuda_model(args, ref_model)
+    else:
+        ref_model, dpo_loss = None, None
+
     # Get executor
-    executor = Executor(gan=gan)
+    executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)
     executor.step = start_step
 
     # Init scaler, used for pytorch amp mixed precision training
     scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
     print('start step {} start epoch {}'.format(start_step, start_epoch))
+
     # Start training loop
     for epoch in range(start_epoch + 1, info_dict['max_epoch']):
         executor.epoch = epoch
@@ -167,7 +187,7 @@ def main():
             executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
                                         writer, info_dict, scaler, group_join)
         else:
-            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
+            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=ref_model)
         dist.destroy_process_group(group_join)
 
 
xinference/thirdparty/cosyvoice/cli/cosyvoice.py

@@ -26,7 +26,7 @@ from cosyvoice.utils.class_utils import get_model_type
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -59,6 +59,7 @@ class CosyVoice:
         if load_trt:
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
+                                trt_concurrent,
                                 self.fp16)
         del configs
 
@@ -140,7 +141,7 @@ class CosyVoice:
 
 class CosyVoice2(CosyVoice):
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, use_flow_cache=False):
+    def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -162,15 +163,18 @@ class CosyVoice2(CosyVoice):
         if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
             load_jit, load_trt, fp16 = False, False, False
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
-        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16, use_flow_cache)
+        self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
-                        '{}/flow.pt'.format(model_dir) if use_flow_cache is False else '{}/flow.cache.pt'.format(model_dir),
+                        '{}/flow.pt'.format(model_dir),
                        '{}/hift.pt'.format(model_dir))
+        if load_vllm:
+            self.model.load_vllm('{}/vllm'.format(model_dir))
         if load_jit:
             self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         if load_trt:
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
+                                trt_concurrent,
                                 self.fp16)
         del configs
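
Editor's note: taken together, the CosyVoice2 wrapper now exposes load_vllm and trt_concurrent instead of use_flow_cache. The sketch below is a hedged usage example based only on the signature shown above; the model directory is a placeholder and the keyword values are examples, not recommended settings.

    from cosyvoice.cli.cosyvoice import CosyVoice2

    cosyvoice = CosyVoice2(
        'pretrained_models/CosyVoice2-0.5B',  # placeholder path
        load_jit=False,
        load_trt=True,       # converts/loads the TensorRT flow-decoder plan
        load_vllm=True,      # exports the LLM for vLLM and attaches an LLMEngine to it
        fp16=True,
        trt_concurrent=2,    # forwarded to load_trt (number of TensorRT execution contexts)
    )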
 
xinference/thirdparty/cosyvoice/cli/frontend.py

@@ -28,9 +28,9 @@ try:
     import ttsfrd
     use_ttsfrd = True
 except ImportError:
-    print("failed to import ttsfrd, use WeTextProcessing instead")
-    from tn.chinese.normalizer import Normalizer as ZhNormalizer
-    from tn.english.normalizer import Normalizer as EnNormalizer
+    print("failed to import ttsfrd, use wetext instead")
+    from wetext import Normalizer as ZhNormalizer
+    from wetext import Normalizer as EnNormalizer
     use_ttsfrd = False
 from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
@@ -68,7 +68,7 @@ class CosyVoiceFrontEnd:
                 'failed to initialize ttsfrd resource'
             self.frd.set_lang_type('pinyinvg')
         else:
-            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False)
             self.en_tn_model = EnNormalizer()
             self.inflect_parser = inflect.engine()
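
Editor's note: the fallback text normalizer moves from WeTextProcessing (the tn package) to wetext. A minimal sketch of the new fallback follows, assuming wetext's Normalizer exposes a normalize() method comparable to the tn normalizers it replaces; the sample sentence is arbitrary.

    from wetext import Normalizer as ZhNormalizer

    zh_tn_model = ZhNormalizer(remove_erhua=False)
    # normalize() is assumed by analogy with the tn normalizers; verify against the wetext docs.
    print(zh_tn_model.normalize('今天是2024年1月1日'))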
 
xinference/thirdparty/cosyvoice/cli/model.py

@@ -1,4 +1,5 @@
 # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +22,8 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
-from cosyvoice.utils.file_utils import convert_onnx_to_trt
+from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
+from cosyvoice.utils.common import TrtContextWrapper
 
 
 class CosyVoiceModel:
@@ -80,30 +82,28 @@ class CosyVoiceModel:
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
+    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, trt_concurrent, fp16):
         assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
-        if not os.path.exists(flow_decoder_estimator_model):
+        if not os.path.exists(flow_decoder_estimator_model) or os.path.getsize(flow_decoder_estimator_model) == 0:
             convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
-        if os.path.getsize(flow_decoder_estimator_model) == 0:
-            raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
         del self.flow.decoder.estimator
         import tensorrt as trt
         with open(flow_decoder_estimator_model, 'rb') as f:
-            self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
-        assert self.flow.decoder.estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
-        self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
+            estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+        assert estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
+        self.flow.decoder.estimator = TrtContextWrapper(estimator_engine, trt_concurrent=trt_concurrent, device=self.device)
 
     def get_trt_kwargs(self):
         min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
-        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200)]
+        opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)]
         max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
         input_names = ["x", "mask", "mu", "cond"]
         return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
 
     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        with self.llm_context, torch.cuda.amp.autocast(self.fp16):
+        with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False):
             if isinstance(text, Generator):
-                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
+                assert isinstance(self, CosyVoice2Model) and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2 and do not support vllm!'
                 for i in self.llm.inference_bistream(text=text,
                                                      prompt_text=prompt_text.to(self.device),
                                                      prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
@@ -118,7 +118,8 @@ class CosyVoiceModel:
                                          prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
                                          prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                          prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                         embedding=llm_embedding.to(self.device)):
+                                         embedding=llm_embedding.to(self.device),
+                                         uuid=uuid):
                     self.tts_speech_token_dict[uuid].append(i)
                 self.llm_end_dict[uuid] = True
 
@@ -231,7 +232,9 @@ class CosyVoiceModel:
             self.mel_overlap_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
             self.flow_cache_dict.pop(this_uuid)
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()
 
 
 class CosyVoice2Model(CosyVoiceModel):
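
Editor's note: load_trt now rebuilds the plan whenever the file is missing or empty and hands the deserialized engine to TrtContextWrapper together with trt_concurrent. That wrapper is not part of this hunk; the sketch below only illustrates the general idea of pooling several TensorRT execution contexts for one engine and is not the cosyvoice.utils.common.TrtContextWrapper implementation.

    import queue

    import tensorrt as trt  # requires a CUDA-capable environment

    class ExecutionContextPool:
        """Illustrative pool of execution contexts created from one deserialized TRT engine."""

        def __init__(self, engine, trt_concurrent: int = 1):
            self.engine = engine
            self._contexts = queue.Queue()
            for _ in range(trt_concurrent):
                # create_execution_context() is the standard TensorRT call used in the removed code above.
                self._contexts.put(engine.create_execution_context())

        def acquire(self):
            # Blocks until a context is free, so up to trt_concurrent callers can run in parallel.
            return self._contexts.get()

        def release(self, context) -> None:
            self._contexts.put(context)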
@@ -240,20 +243,17 @@ class CosyVoice2Model(CosyVoiceModel):
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
-                 fp16: bool = False,
-                 use_flow_cache: bool = False):
+                 fp16: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
         self.fp16 = fp16
-        self.use_flow_cache = use_flow_cache
         if self.fp16 is True:
             self.llm.half()
             self.flow.half()
-        # stream related params, check examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
+        # NOTE must matching training static_chunk_size
         self.token_hop_len = 25
-        self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
         # hift cache
         self.mel_cache_len = 8
         self.source_cache_len = int(self.mel_cache_len * 480)
@@ -265,55 +265,35 @@ class CosyVoice2Model(CosyVoiceModel):
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
         self.llm_end_dict = {}
-        self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def init_flow_cache(self):
-        encoder_cache = {'offset': 0,
-                         'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
-                         'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
-                         'upsample_offset': 0,
-                         'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
-                         'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
-        decoder_cache = {'offset': 0,
-                         'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
-                         'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
-                         'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
-                         'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
-                         'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
-        if self.fp16 is True:
-            for cache in [encoder_cache, decoder_cache]:
-                for k, v in cache.items():
-                    if isinstance(v, torch.Tensor):
-                        cache[k] = v.half()
-        cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
-        return cache
-
     def load_jit(self, flow_encoder_model):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def get_trt_kwargs(self):
-        min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4), (1, 4, 2, 0, 512, 2), (12, 4, 2, 0, 512, 2), (1, 4, 2, 0, 512, 2)]
-        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200), (1, 4, 2, 100, 512, 2), (12, 4, 2, 100, 512, 2), (1, 4, 2, 100, 512, 2)]
-        max_shape = [(2, 80, 1500), (2, 1, 1500), (2, 80, 1500), (2, 80, 1500), (1, 4, 2, 200, 512, 2), (12, 4, 2, 200, 512, 2), (1, 4, 2, 200, 512, 2)]
-        input_names = ["x", "mask", "mu", "cond", 'down_blocks_kv_cache', 'mid_blocks_kv_cache', 'up_blocks_kv_cache']
-        assert self.use_flow_cache is True, "get_trt_kwargs is set for flow cache mode. If you want to use trt with use_flow_cache=False, please set higher max_shape"
-        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
+    def load_vllm(self, model_dir):
+        export_cosyvoice2_vllm(self.llm, model_dir, self.device)
+        from vllm import EngineArgs, LLMEngine
+        engine_args = EngineArgs(model=model_dir,
+                                 skip_tokenizer_init=True,
+                                 enable_prompt_embeds=True,
+                                 gpu_memory_utilization=0.2)
+        self.llm.vllm = LLMEngine.from_engine_args(engine_args)
+        self.llm.lock = threading.Lock()
+        del self.llm.llm.model.model.layers
 
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
-                                                                      token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
-                                                                      prompt_token=prompt_token.to(self.device),
-                                                                      prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
-                                                                      prompt_feat=prompt_feat.to(self.device),
-                                                                      prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
-                                                                      embedding=embedding.to(self.device),
-                                                                      cache=self.flow_cache_dict[uuid],
-                                                                      finalize=finalize)
+            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+                                             token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_token=prompt_token.to(self.device),
+                                             prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                             prompt_feat=prompt_feat.to(self.device),
+                                             prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                             embedding=embedding.to(self.device),
+                                             streaming=stream,
+                                             finalize=finalize)
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
         # append hift cache
         if self.hift_cache_dict[uuid] is not None:
             hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
@@ -348,34 +328,30 @@ class CosyVoice2Model(CosyVoiceModel):
         with self.lock:
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
-            self.flow_cache_dict[this_uuid] = self.init_flow_cache()
         if source_speech_token.shape[1] == 0:
             p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
         else:
             p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
-            assert self.use_flow_cache is True, "set use_flow_cache=True if you want to use stream inference to avoid OOM"
-            # NOTE in cache mode, trim flow_prompt to same size as flow_decoder_required_cache_size
-            flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
-            prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
+            token_offset = 0
+            prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
             while True:
                 time.sleep(0.1)
-                if len(self.tts_speech_token_dict[this_uuid]) >= self.token_hop_len + self.flow.pre_lookahead_len:
-                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
+                this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
+                if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
                                                      embedding=flow_embedding,
+                                                     token_offset=token_offset,
                                                      uuid=this_uuid,
+                                                     stream=stream,
                                                      finalize=False)
-                    # NOTE in cache inference mode, we only use flow_prompt_speech_token/prompt_speech_feat in first chunk
-                    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
-                    prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
+                    token_offset += this_token_hop_len
                     yield {'tts_speech': this_tts_speech.cpu()}
-                with self.lock:
-                    self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                    break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
@@ -384,18 +360,19 @@ class CosyVoice2Model(CosyVoiceModel):
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=token_offset,
                                              uuid=this_uuid,
                                              finalize=True)
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
             p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
+                                             token_offset=0,
                                              uuid=this_uuid,
                                              finalize=True,
                                              speed=speed)
@@ -404,5 +381,6 @@ class CosyVoice2Model(CosyVoiceModel):
             self.tts_speech_token_dict.pop(this_uuid)
             self.llm_end_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
-            self.flow_cache_dict.pop(this_uuid)
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.current_stream().synchronize()
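
Editor's note: the streaming path now keeps an absolute token_offset into the ever-growing token list instead of consuming tokens and re-priming a flow cache, and it pads the first hop so that the prompt plus the first chunk lands on a token_hop_len boundary. Below is a small standalone sketch of that chunking arithmetic; pre_lookahead_len and the token counts are placeholder values, while in the real code they come from the flow model and the LLM thread.

    import numpy as np

    token_hop_len = 25       # fixed hop; the hunk notes it must match the training static_chunk_size
    pre_lookahead_len = 3    # placeholder; taken from self.flow.pre_lookahead_len in the real code
    prompt_len = 37          # placeholder length of flow_prompt_speech_token

    # Pad the first hop so prompt_len + first_hop is a multiple of token_hop_len.
    prompt_token_pad = int(np.ceil(prompt_len / token_hop_len) * token_hop_len - prompt_len)

    token_offset, generated = 0, 0
    while generated < 200:   # pretend the LLM thread eventually produces 200 speech tokens
        generated += 10
        this_hop = token_hop_len + prompt_token_pad if token_offset == 0 else token_hop_len
        if generated - token_offset >= this_hop + pre_lookahead_len:
            # token2wav would see tokens[:token_offset + this_hop + pre_lookahead_len]
            # and trim the resulting mel by token_offset * token_mel_ratio.
            token_offset += this_hop
    print('final token_offset:', token_offset)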