xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py

@@ -56,16 +56,11 @@ class Upsample1D(nn.Module):
         # In this mode, first repeat interpolate, than conv with stride=1
         self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
 
-    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor, conv_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
-        if conv_cache.size(2) == 0:
-            outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
-        else:
-            assert conv_cache.size(2) == self.stride * 2
-            outputs = torch.concat([conv_cache, outputs], dim=2)
-        conv_cache_new = outputs[:, :, -self.stride * 2:]
+        outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
         outputs = self.conv(outputs)
-        return outputs, input_lengths * self.stride, conv_cache_new
+        return outputs, input_lengths * self.stride
 
 
 class PreLookaheadLayer(nn.Module):
@@ -83,7 +78,7 @@ class PreLookaheadLayer(nn.Module):
             kernel_size=3, stride=1, padding=0,
         )
 
-    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0), conv2_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
         """
         inputs: (batch_size, seq_len, channels)
         """
@@ -93,22 +88,18 @@
         if context.size(2) == 0:
             outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
         else:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
             assert context.size(2) == self.pre_lookahead_len
             outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
         outputs = F.leaky_relu(self.conv1(outputs))
         # outputs
-        if conv2_cache.size(2) == 0:
-            outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
-        else:
-            assert conv2_cache.size(2) == self.conv2.kernel_size[0] - 1
-            outputs = torch.concat([conv2_cache, outputs], dim=2)
-        conv2_cache_new = outputs[:, :, -(self.conv2.kernel_size[0] - 1):]
+        outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
         outputs = self.conv2(outputs)
         outputs = outputs.transpose(1, 2).contiguous()
 
         # residual connection
         outputs = outputs + inputs
-        return outputs, conv2_cache_new
+        return outputs
 
 
 class UpsampleConformerEncoder(torch.nn.Module):
@@ -253,6 +244,7 @@ class UpsampleConformerEncoder(torch.nn.Module):
         self,
         xs: torch.Tensor,
         xs_lens: torch.Tensor,
+        context: torch.Tensor = torch.zeros(0, 0, 0),
         decoding_chunk_size: int = 0,
         num_decoding_left_chunks: int = -1,
         streaming: bool = False,
@@ -285,15 +277,19 @@
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         xs, pos_emb, masks = self.embed(xs, masks)
+        if context.size(1) != 0:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
+            context_masks = torch.ones(1, 1, context.size(1)).to(masks)
+            context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
         mask_pad = masks  # (B, 1, T/subsample_rate)
         chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
         # lookahead + conformer encoder
-        xs, _ = self.pre_lookahead_layer(xs)
+        xs = self.pre_lookahead_layer(xs, context=context)
         xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
 
         # upsample + conformer encoder
         xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens, _ = self.up_layer(xs, xs_lens)
+        xs, xs_lens = self.up_layer(xs, xs_lens)
         xs = xs.transpose(1, 2).contiguous()
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
@@ -322,100 +318,3 @@ class UpsampleConformerEncoder(torch.nn.Module):
         for layer in self.up_encoders:
             xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
         return xs
-
-    @torch.jit.export
-    def forward_chunk(
-        self,
-        xs: torch.Tensor,
-        xs_lens: torch.Tensor,
-        offset: int = 0,
-        context: torch.Tensor = torch.zeros(0, 0, 0),
-        pre_lookahead_layer_conv2_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        encoders_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0),
-        upsample_offset: int = 0,
-        upsample_conv_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        upsample_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0)
-    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[int, torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]]:
-        """Embed positions in tensor.
-
-        Args:
-            xs: padded input tensor (B, T, D)
-            xs_lens: input length (B)
-            decoding_chunk_size: decoding chunk size for dynamic chunk
-                0: default for training, use random dynamic chunk.
-                <0: for decoding, use full chunk.
-                >0: for decoding, use fixed chunk size as set.
-            num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
-                >=0: use num_decoding_left_chunks
-                <0: use all left chunks
-        Returns:
-            encoder output tensor xs, and subsampled masks
-            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
-            masks: torch.Tensor batch padding mask after subsample
-                (B, 1, T' ~= T/subsample_rate)
-        NOTE(xcsong):
-            We pass the `__call__` method of the modules instead of `forward` to the
-            checkpointing API because `__call__` attaches all the hooks of the module.
-            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-        """
-        assert xs.size(0) == 1
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
-        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
-        offset += xs.size(1)
-        tmp_masks = torch.ones(1,
-                               context.size(1),
-                               device=context.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if context.size(1) != 0:
-            context, _, _ = self.embed(context, tmp_masks, offset)
-
-        # lookahead + conformer encoder
-        xs, pre_lookahead_layer_conv2_cache = self.pre_lookahead_layer(xs, context, pre_lookahead_layer_conv2_cache)
-        # NOTE in cache mode we do not need to call add_optional_chunk_mask
-        chunk_masks = torch.ones((1, xs.size(1), offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        encoders_kv_cache_list = []
-        for index, layer in enumerate(self.encoders):
-            xs, chunk_masks, encoders_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, encoders_kv_cache[index])
-            encoders_kv_cache_list.append(encoders_kv_cache_new)
-        encoders_kv_cache = torch.stack(encoders_kv_cache_list, dim=0)
-
-        # upsample
-        xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens, upsample_conv_cache = self.up_layer(xs, xs_lens, upsample_conv_cache)
-        xs = xs.transpose(1, 2).contiguous()
-
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        xs, pos_emb, masks = self.up_embed(xs, tmp_masks, upsample_offset)
-        upsample_offset += xs.size(1)
-
-        # conformer encoder
-        chunk_masks = torch.ones((1, xs.size(1), upsample_offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        upsample_kv_cache_list = []
-        for index, layer in enumerate(self.up_encoders):
-            xs, chunk_masks, upsample_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, upsample_kv_cache[index])
-            upsample_kv_cache_list.append(upsample_kv_cache_new)
-        upsample_kv_cache = torch.stack(upsample_kv_cache_list, dim=0)
-
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-        # Here we assume the mask is not changed in encoder layers, so just
-        # return the masks before encoder layers, and the masks will be used
-        # for cross attention with decoder later
-        return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache, upsample_offset, upsample_conv_cache, upsample_kv_cache)
xinference/thirdparty/cosyvoice/utils/common.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
 #               2024 Alibaba Inc (authors: Xiang Lyu)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +16,7 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import queue
 import random
 from typing import List
 
@@ -164,3 +166,21 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
     # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
     mask = (1.0 - mask) * -1.0e+10
     return mask
+
+
+class TrtContextWrapper:
+    def __init__(self, trt_engine, trt_concurrent=1, device='cuda:0'):
+        self.trt_context_pool = queue.Queue(maxsize=trt_concurrent)
+        self.trt_engine = trt_engine
+        for _ in range(trt_concurrent):
+            trt_context = trt_engine.create_execution_context()
+            trt_stream = torch.cuda.stream(torch.cuda.Stream(device))
+            assert trt_context is not None, 'failed to create trt context, maybe not enough CUDA memory, try reduce current trt concurrent {}'.format(trt_concurrent)
+            self.trt_context_pool.put([trt_context, trt_stream])
+        assert self.trt_context_pool.empty() is False, 'no avaialbe estimator context'
+
+    def acquire_estimator(self):
+        return self.trt_context_pool.get(), self.trt_engine
+
+    def release_estimator(self, context, stream):
+        self.trt_context_pool.put([context, stream])
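
A minimal usage sketch of the new TrtContextWrapper (not part of the diff; `engine` and the inference call inside the stream are hypothetical placeholders): a context/stream pair is borrowed from the pool for one request and returned afterwards.

# Hedged sketch: `engine` is assumed to be an already deserialized
# tensorrt.ICudaEngine; buffer binding and the actual execute call depend on
# how the caller drives TensorRT and are only indicated by comments here.
wrapper = TrtContextWrapper(engine, trt_concurrent=2, device='cuda:0')

[trt_context, trt_stream], trt_engine = wrapper.acquire_estimator()
try:
    with trt_stream:
        # bind input/output buffers on trt_context and run the engine,
        # e.g. via trt_context.execute_v2(...) or execute_async_v3(...)
        pass
finally:
    # hand the context/stream pair back so other concurrent requests can use it
    wrapper.release_estimator(trt_context, trt_stream)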
xinference/thirdparty/cosyvoice/utils/executor.py

@@ -25,14 +25,16 @@ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, l
 
 class Executor:
 
-    def __init__(self, gan: bool = False):
+    def __init__(self, gan: bool = False, ref_model: torch.nn.Module = None, dpo_loss: torch.nn.Module = None):
         self.gan = gan
+        self.ref_model = ref_model
+        self.dpo_loss = dpo_loss
         self.step = 0
         self.epoch = 0
         self.rank = int(os.environ.get('RANK', 0))
         self.device = torch.device('cuda:{}'.format(self.rank))
 
-    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join):
+    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=None):
         ''' Train one epoch
         '''
 
@@ -44,6 +46,8 @@ class Executor:
         # torch.nn.parallel.DistributedDataParallel to be able to train
         # with uneven inputs across participating processes.
         model.train()
+        if self.ref_model is not None:
+            self.ref_model.eval()
         model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
         with model_context():
             for batch_idx, batch_dict in enumerate(train_data_loader):
@@ -65,7 +69,7 @@
                     context = nullcontext
 
                 with context():
-                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict, ref_model=self.ref_model, dpo_loss=self.dpo_loss)
                     info_dict = batch_backward(model, scaler, info_dict)
 
                 info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
@@ -162,7 +166,7 @@ class Executor:
                 for k, v in info_dict['loss_dict'].items():
                     if k not in total_loss_dict:
                         total_loss_dict[k] = []
-                    total_loss_dict[k].append(v.item() * num_utts)
+                    total_loss_dict[k].append(v.mean().item() * num_utts)
                 log_per_step(None, info_dict)
             for k, v in total_loss_dict.items():
                 total_loss_dict[k] = sum(v) / total_num_utts
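
How the new ref_model / dpo_loss arguments are meant to fit together (a hedged sketch, not code from the wheel: `model`, the frozen-copy step, and the beta value are illustrative assumptions):

import copy

from cosyvoice.utils.losses import DPOLoss  # added in this release

# The reference model is typically a frozen copy of the policy being trained.
ref_model = copy.deepcopy(model)
for p in ref_model.parameters():
    p.requires_grad_(False)

executor = Executor(gan=False, ref_model=ref_model, dpo_loss=DPOLoss(beta=0.01))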
xinference/thirdparty/cosyvoice/utils/file_utils.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
 #               2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import json
+import torch
 import torchaudio
 import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -56,7 +59,7 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     network = builder.create_network(network_flags)
     parser = trt.OnnxParser(network, logger)
     config = builder.create_builder_config()
-    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 33)  # 8GB
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)  # 4GB
     if fp16:
         config.set_flag(trt.BuilderFlag.FP16)
     profile = builder.create_optimization_profile()
@@ -83,3 +86,44 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     with open(trt_model, "wb") as f:
         f.write(engine_bytes)
     logging.info("Succesfully convert onnx to trt...")
+
+
+def export_cosyvoice2_vllm(model, model_path, device):
+    if os.path.exists(model_path):
+        return
+    pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
+    vocab_size = model.speech_embedding.num_embeddings
+    feature_size = model.speech_embedding.embedding_dim
+    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+    dtype = torch.bfloat16
+    # lm_head
+    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
+    with torch.no_grad():
+        new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
+        new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
+        new_lm_head.weight[vocab_size:] = 0
+        new_lm_head.bias[vocab_size:] = 0
+    model.llm.model.lm_head = new_lm_head
+    new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
+    # embed_tokens
+    embed_tokens = model.llm.model.model.embed_tokens
+    with torch.no_grad():
+        new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
+        new_codec_embed.weight[vocab_size:] = 0
+    model.llm.model.set_input_embeddings(new_codec_embed)
+    model.llm.model.to(device)
+    model.llm.model.to(dtype)
+    tmp_vocab_size = model.llm.model.config.vocab_size
+    tmp_tie_embedding = model.llm.model.config.tie_word_embeddings
+    del model.llm.model.generation_config.eos_token_id
+    del model.llm.model.config.bos_token_id
+    del model.llm.model.config.eos_token_id
+    model.llm.model.config.vocab_size = pad_vocab_size
+    model.llm.model.config.tie_word_embeddings = False
+    model.llm.model.config.use_bias = True
+    model.llm.model.save_pretrained(model_path)
+    os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
+    model.llm.model.config.vocab_size = tmp_vocab_size
+    model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
+    model.llm.model.set_input_embeddings(embed_tokens)
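
The exported head and embedding are padded so the speech-token vocabulary becomes a multiple of the default vocab padding (64); the rounding is plain integer arithmetic, checked below with a made-up vocab size for illustration:

# Hedged example of the round-up used in export_cosyvoice2_vllm above.
pad_to = 64
vocab_size = 6561          # hypothetical speech-token vocab size
pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
assert pad_vocab_size == 6592   # smallest multiple of 64 >= vocab_size
# rows [vocab_size:pad_vocab_size) of the new lm_head / embedding stay zero-filled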
xinference/thirdparty/cosyvoice/utils/losses.py

@@ -1,5 +1,6 @@
 import torch
 import torch.nn.functional as F
+from typing import Tuple
 
 
 def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
@@ -18,3 +19,39 @@ def mel_loss(real_speech, generated_speech, mel_transforms):
         mel_g = transform(generated_speech)
         loss += F.l1_loss(mel_g, mel_r)
     return loss
+
+
+class DPOLoss(torch.nn.Module):
+    """
+    DPO Loss
+    """
+
+    def __init__(self, beta: float, label_smoothing: float = 0.0, ipo: bool = False) -> None:
+        super().__init__()
+        self.beta = beta
+        self.label_smoothing = label_smoothing
+        self.ipo = ipo
+
+    def forward(
+        self,
+        policy_chosen_logps: torch.Tensor,
+        policy_rejected_logps: torch.Tensor,
+        reference_chosen_logps: torch.Tensor,
+        reference_rejected_logps: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pi_logratios = policy_chosen_logps - policy_rejected_logps
+        ref_logratios = reference_chosen_logps - reference_rejected_logps
+        logits = pi_logratios - ref_logratios
+        if self.ipo:
+            losses = (logits - 1 / (2 * self.beta)) ** 2  # Eq. 17 of https://arxiv.org/pdf/2310.12036v2.pdf
+        else:
+            # Eq. 3 https://ericmitchell.ai/cdpo.pdf; label_smoothing=0 gives original DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        loss = losses.mean()
+        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
+        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
+
+        return loss, chosen_rewards, rejected_rewards
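
A small self-contained sanity check of DPOLoss (not from the package; the log-probabilities are random placeholders standing in for per-pair sequence log-probs):

import torch

# Assumes the DPOLoss class from the hunk above is in scope.
dpo = DPOLoss(beta=0.1, label_smoothing=0.0, ipo=False)

B = 4  # a batch of preference pairs
policy_chosen = torch.randn(B)
policy_rejected = torch.randn(B)
reference_chosen = torch.randn(B)
reference_rejected = torch.randn(B)

loss, chosen_rewards, rejected_rewards = dpo(
    policy_chosen, policy_rejected, reference_chosen, reference_rejected
)
# loss is a scalar; the per-pair rewards feed metrics such as dpo_acc in batch_forward.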
xinference/thirdparty/cosyvoice/utils/mask.py

@@ -86,7 +86,7 @@ def subsequent_mask(
     return mask
 
 
-def subsequent_chunk_mask(
+def subsequent_chunk_mask_deprecated(
         size: int,
         chunk_size: int,
         num_left_chunks: int = -1,
@@ -124,6 +124,40 @@
     return ret
 
 
+def subsequent_chunk_mask(
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+    Returns:
+        torch.Tensor: mask
+
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
+    pos_idx = torch.arange(size, device=device)
+    block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
+    ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
+    return ret
+
+
 def add_optional_chunk_mask(xs: torch.Tensor,
                             masks: torch.Tensor,
                             use_dynamic_chunk: bool,
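
What the reworked, ONNX-friendly subsequent_chunk_mask computes can be checked in isolation (a standalone sketch reproducing the docstring example, not code from the diff):

import torch

size, chunk_size = 4, 2
pos_idx = torch.arange(size)
# each position may attend up to the end of its own chunk: block_value = [2, 2, 4, 4]
block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
mask = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
print(mask.int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])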
xinference/thirdparty/cosyvoice/utils/train_utils.py

@@ -50,10 +50,10 @@ def init_distributed(args):
     return world_size, local_rank, rank
 
 
-def init_dataset_and_dataloader(args, configs, gan):
+def init_dataset_and_dataloader(args, configs, gan, dpo):
     data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline']
-    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=True, partition=True)
-    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=False, partition=False)
+    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True)
+    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False)
 
     # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
     train_data_loader = DataLoader(train_dataset,
@@ -71,7 +71,7 @@ def init_dataset_and_dataloader(args, configs, gan):
 
 def check_modify_and_save_config(args, configs):
     if args.train_engine == "torch_ddp":
-        configs['train_conf']["dtype"] = 'fp32'
+        configs['train_conf']["dtype"] = 'bf16' if args.use_amp is True else 'fp32'
     else:
         with open(args.deepspeed_config, 'r') as fin:
             ds_configs = json.load(fin)
@@ -235,7 +235,7 @@ def cosyvoice_join(group_join, info_dict):
         return False
 
 
-def batch_forward(model, batch, scaler, info_dict):
+def batch_forward(model, batch, scaler, info_dict, ref_model=None, dpo_loss=None):
     device = int(os.environ.get('LOCAL_RANK', 0))
 
     dtype = info_dict["dtype"]
@@ -247,12 +247,30 @@ def batch_forward(model, batch, scaler, info_dict):
         dtype = torch.float32
 
     if info_dict['train_engine'] == 'torch_ddp':
-        autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
+        autocast = torch.cuda.amp.autocast(enabled=scaler is not None, dtype=dtype)
     else:
         autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
 
     with autocast:
         info_dict['loss_dict'] = model(batch, device)
+        if ref_model is not None and dpo_loss is not None:
+            chosen_logps = info_dict['loss_dict']["chosen_logps"]
+            rejected_logps = info_dict['loss_dict']["rejected_logps"]
+            sft_loss = info_dict['loss_dict']['loss']
+            with torch.no_grad():
+                ref_loss_dict = ref_model(batch, device)
+                reference_chosen_logps = ref_loss_dict["chosen_logps"]
+                reference_rejected_logps = ref_loss_dict["rejected_logps"]
+            preference_loss, chosen_reward, reject_reward = dpo_loss(
+                chosen_logps, rejected_logps, reference_chosen_logps, reference_rejected_logps
+            )
+            dpo_acc = (chosen_reward > reject_reward).float().mean()
+            info_dict['loss_dict']["loss"] = preference_loss + sft_loss
+            info_dict['loss_dict']["sft_loss"] = sft_loss
+            info_dict['loss_dict']["dpo_loss"] = preference_loss
+            info_dict['loss_dict']["dpo_acc"] = dpo_acc
+            info_dict['loss_dict']["chosen_reward"] = chosen_reward.mean()
+            info_dict['loss_dict']["reject_reward"] = reject_reward.mean()
     return info_dict
 
 
xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py (new file)

@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
+from vllm.model_executor.models.qwen2 import *
+
+
+class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.lora_config = lora_config
+
+        self.quant_config = quant_config
+        self.model = Qwen2Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              True,
+                                              quant_config=quant_config,
+                                              prefix=maybe_prefix(
+                                                  prefix, "lm_head"))
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata, self.lm_head.bias)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
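
For orientation only (not part of the wheel): a custom architecture like this is normally made visible to vLLM through its model registry once the exported config.json names CosyVoice2ForCausalLM. A hedged sketch of that registration, assuming the module path used in this package:

from vllm import ModelRegistry

from xinference.thirdparty.cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM

# export_cosyvoice2_vllm() rewrites the exported config.json to use the
# "CosyVoice2ForCausalLM" architecture name; vLLM then needs a class
# registered under that name before loading the model.
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)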