xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/model.py +3 -4
- xinference/core/supervisor.py +29 -1
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +64 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +19 -4
- xinference/model/embedding/vllm/core.py +40 -8
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +116 -9
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +42 -40
- xinference/model/llm/llm_family.json +435 -23
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +6 -12
- xinference/model/llm/utils.py +128 -46
- xinference/model/llm/vllm/core.py +8 -61
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py

@@ -56,16 +56,11 @@ class Upsample1D(nn.Module):
         # In this mode, first repeat interpolate, than conv with stride=1
         self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
 
-    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor
+    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
-
-            outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
-        else:
-            assert conv_cache.size(2) == self.stride * 2
-            outputs = torch.concat([conv_cache, outputs], dim=2)
-        conv_cache_new = outputs[:, :, -self.stride * 2:]
+        outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
         outputs = self.conv(outputs)
-        return outputs, input_lengths * self.stride
+        return outputs, input_lengths * self.stride
 
 
 class PreLookaheadLayer(nn.Module):
@@ -83,7 +78,7 @@ class PreLookaheadLayer(nn.Module):
             kernel_size=3, stride=1, padding=0,
         )
 
-    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)
+    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
         """
         inputs: (batch_size, seq_len, channels)
         """
@@ -93,22 +88,18 @@
         if context.size(2) == 0:
             outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
         else:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
             assert context.size(2) == self.pre_lookahead_len
             outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
         outputs = F.leaky_relu(self.conv1(outputs))
         # outputs
-
-            outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
-        else:
-            assert conv2_cache.size(2) == self.conv2.kernel_size[0] - 1
-            outputs = torch.concat([conv2_cache, outputs], dim=2)
-        conv2_cache_new = outputs[:, :, -(self.conv2.kernel_size[0] - 1):]
+        outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
         outputs = self.conv2(outputs)
         outputs = outputs.transpose(1, 2).contiguous()
 
         # residual connection
         outputs = outputs + inputs
-        return outputs
+        return outputs
 
 
 class UpsampleConformerEncoder(torch.nn.Module):
@@ -253,6 +244,7 @@ class UpsampleConformerEncoder(torch.nn.Module):
         self,
         xs: torch.Tensor,
         xs_lens: torch.Tensor,
+        context: torch.Tensor = torch.zeros(0, 0, 0),
         decoding_chunk_size: int = 0,
         num_decoding_left_chunks: int = -1,
         streaming: bool = False,
@@ -285,15 +277,19 @@ class UpsampleConformerEncoder(torch.nn.Module):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         xs, pos_emb, masks = self.embed(xs, masks)
+        if context.size(1) != 0:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
+            context_masks = torch.ones(1, 1, context.size(1)).to(masks)
+            context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
         mask_pad = masks  # (B, 1, T/subsample_rate)
         chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
         # lookahead + conformer encoder
-        xs
+        xs = self.pre_lookahead_layer(xs, context=context)
         xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
 
         # upsample + conformer encoder
         xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens
+        xs, xs_lens = self.up_layer(xs, xs_lens)
         xs = xs.transpose(1, 2).contiguous()
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
@@ -322,100 +318,3 @@ class UpsampleConformerEncoder(torch.nn.Module):
         for layer in self.up_encoders:
             xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
         return xs
-
-    @torch.jit.export
-    def forward_chunk(
-        self,
-        xs: torch.Tensor,
-        xs_lens: torch.Tensor,
-        offset: int = 0,
-        context: torch.Tensor = torch.zeros(0, 0, 0),
-        pre_lookahead_layer_conv2_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        encoders_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0),
-        upsample_offset: int = 0,
-        upsample_conv_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        upsample_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0)
-    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[int, torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]]:
-        """Embed positions in tensor.
-
-        Args:
-            xs: padded input tensor (B, T, D)
-            xs_lens: input length (B)
-            decoding_chunk_size: decoding chunk size for dynamic chunk
-                0: default for training, use random dynamic chunk.
-                <0: for decoding, use full chunk.
-                >0: for decoding, use fixed chunk size as set.
-            num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
-                >=0: use num_decoding_left_chunks
-                <0: use all left chunks
-        Returns:
-            encoder output tensor xs, and subsampled masks
-            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
-            masks: torch.Tensor batch padding mask after subsample
-                (B, 1, T' ~= T/subsample_rate)
-        NOTE(xcsong):
-            We pass the `__call__` method of the modules instead of `forward` to the
-            checkpointing API because `__call__` attaches all the hooks of the module.
-            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-        """
-        assert xs.size(0) == 1
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
-        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
-        offset += xs.size(1)
-        tmp_masks = torch.ones(1,
-                               context.size(1),
-                               device=context.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if context.size(1) != 0:
-            context, _, _ = self.embed(context, tmp_masks, offset)
-
-        # lookahead + conformer encoder
-        xs, pre_lookahead_layer_conv2_cache = self.pre_lookahead_layer(xs, context, pre_lookahead_layer_conv2_cache)
-        # NOTE in cache mode we do not need to call add_optional_chunk_mask
-        chunk_masks = torch.ones((1, xs.size(1), offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        encoders_kv_cache_list = []
-        for index, layer in enumerate(self.encoders):
-            xs, chunk_masks, encoders_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, encoders_kv_cache[index])
-            encoders_kv_cache_list.append(encoders_kv_cache_new)
-        encoders_kv_cache = torch.stack(encoders_kv_cache_list, dim=0)
-
-        # upsample
-        xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens, upsample_conv_cache = self.up_layer(xs, xs_lens, upsample_conv_cache)
-        xs = xs.transpose(1, 2).contiguous()
-
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        xs, pos_emb, masks = self.up_embed(xs, tmp_masks, upsample_offset)
-        upsample_offset += xs.size(1)
-
-        # conformer encoder
-        chunk_masks = torch.ones((1, xs.size(1), upsample_offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        upsample_kv_cache_list = []
-        for index, layer in enumerate(self.up_encoders):
-            xs, chunk_masks, upsample_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, upsample_kv_cache[index])
-            upsample_kv_cache_list.append(upsample_kv_cache_new)
-        upsample_kv_cache = torch.stack(upsample_kv_cache_list, dim=0)
-
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-        # Here we assume the mask is not changed in encoder layers, so just
-        # return the masks before encoder layers, and the masks will be used
-        # for cross attention with decoder later
-        return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache, upsample_offset, upsample_conv_cache, upsample_kv_cache)
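The streaming `forward_chunk` path and its conv/kv caches are removed from `UpsampleConformerEncoder`, leaving the simpler pattern kept in `Upsample1D.forward`: nearest-neighbour repeat, causal left padding, then a stride-1 conv. A minimal standalone sketch of that pattern; the channel sizes and stride below are illustrative, not the model's real configuration.

```python
# Sketch of the upsample pattern retained in Upsample1D.forward: repeat-interpolate,
# left-pad by stride * 2 so the convolution stays causal, then conv with stride=1.
import torch
import torch.nn.functional as F
from torch import nn

channels, out_channels, stride = 8, 8, 2          # illustrative sizes only
conv = nn.Conv1d(channels, out_channels, stride * 2 + 1, stride=1, padding=0)

inputs = torch.randn(1, channels, 10)             # (B, C, T)
outputs = F.interpolate(inputs, scale_factor=float(stride), mode="nearest")
outputs = F.pad(outputs, (stride * 2, 0), value=0.0)
outputs = conv(outputs)
print(outputs.shape)                              # torch.Size([1, 8, 20]) == T * stride
```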
xinference/thirdparty/cosyvoice/utils/common.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
 # 2024 Alibaba Inc (authors: Xiang Lyu)
+# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +16,7 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import queue
 import random
 from typing import List
 
@@ -164,3 +166,21 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
     # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
     mask = (1.0 - mask) * -1.0e+10
     return mask
+
+
+class TrtContextWrapper:
+    def __init__(self, trt_engine, trt_concurrent=1, device='cuda:0'):
+        self.trt_context_pool = queue.Queue(maxsize=trt_concurrent)
+        self.trt_engine = trt_engine
+        for _ in range(trt_concurrent):
+            trt_context = trt_engine.create_execution_context()
+            trt_stream = torch.cuda.stream(torch.cuda.Stream(device))
+            assert trt_context is not None, 'failed to create trt context, maybe not enough CUDA memory, try reduce current trt concurrent {}'.format(trt_concurrent)
+            self.trt_context_pool.put([trt_context, trt_stream])
+        assert self.trt_context_pool.empty() is False, 'no avaialbe estimator context'
+
+    def acquire_estimator(self):
+        return self.trt_context_pool.get(), self.trt_engine
+
+    def release_estimator(self, context, stream):
+        self.trt_context_pool.put([context, stream])
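The new `TrtContextWrapper` pools TensorRT execution contexts in a `queue.Queue` so that up to `trt_concurrent` requests can share one engine. A hedged usage sketch of the acquire/release pattern follows; it assumes TensorRT and a CUDA device are available, an engine plan file exists (the filename is a placeholder), and that the vendored module is importable as `cosyvoice.utils.common`.

```python
# Hedged sketch only: requires TensorRT, CUDA, and a previously built engine plan file.
import tensorrt as trt
from cosyvoice.utils.common import TrtContextWrapper

with open('estimator.fp16.plan', 'rb') as f:  # placeholder engine file
    engine = trt.Runtime(trt.Logger(trt.Logger.WARNING)).deserialize_cuda_engine(f.read())

wrapper = TrtContextWrapper(engine, trt_concurrent=2, device='cuda:0')

(trt_context, trt_stream), trt_engine = wrapper.acquire_estimator()
try:
    # ... bind I/O tensors and run the estimator on trt_context / trt_stream ...
    pass
finally:
    # Always hand the context back so other concurrent requests are not starved.
    wrapper.release_estimator(trt_context, trt_stream)
```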
xinference/thirdparty/cosyvoice/utils/executor.py

@@ -25,14 +25,16 @@ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, l
 
 class Executor:
 
-    def __init__(self, gan: bool = False):
+    def __init__(self, gan: bool = False, ref_model: torch.nn.Module = None, dpo_loss: torch.nn.Module = None):
         self.gan = gan
+        self.ref_model = ref_model
+        self.dpo_loss = dpo_loss
         self.step = 0
         self.epoch = 0
         self.rank = int(os.environ.get('RANK', 0))
         self.device = torch.device('cuda:{}'.format(self.rank))
 
-    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join):
+    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=None):
         ''' Train one epoch
         '''
 
@@ -44,6 +46,8 @@ class Executor:
         # torch.nn.parallel.DistributedDataParallel to be able to train
         # with uneven inputs across participating processes.
         model.train()
+        if self.ref_model is not None:
+            self.ref_model.eval()
         model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
         with model_context():
             for batch_idx, batch_dict in enumerate(train_data_loader):
@@ -65,7 +69,7 @@ class Executor:
                     context = nullcontext
 
                 with context():
-                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict, ref_model=self.ref_model, dpo_loss=self.dpo_loss)
                     info_dict = batch_backward(model, scaler, info_dict)
 
                 info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
@@ -162,7 +166,7 @@ class Executor:
             for k, v in info_dict['loss_dict'].items():
                 if k not in total_loss_dict:
                     total_loss_dict[k] = []
-                total_loss_dict[k].append(v.item() * num_utts)
+                total_loss_dict[k].append(v.mean().item() * num_utts)
             log_per_step(None, info_dict)
         for k, v in total_loss_dict.items():
             total_loss_dict[k] = sum(v) / total_num_utts
xinference/thirdparty/cosyvoice/utils/file_utils.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
 # 2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
+# 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import json
+import torch
 import torchaudio
 import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -56,7 +59,7 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     network = builder.create_network(network_flags)
     parser = trt.OnnxParser(network, logger)
     config = builder.create_builder_config()
-    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 <<
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)  # 4GB
     if fp16:
         config.set_flag(trt.BuilderFlag.FP16)
     profile = builder.create_optimization_profile()
@@ -83,3 +86,44 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     with open(trt_model, "wb") as f:
         f.write(engine_bytes)
     logging.info("Succesfully convert onnx to trt...")
+
+
+def export_cosyvoice2_vllm(model, model_path, device):
+    if os.path.exists(model_path):
+        return
+    pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
+    vocab_size = model.speech_embedding.num_embeddings
+    feature_size = model.speech_embedding.embedding_dim
+    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+    dtype = torch.bfloat16
+    # lm_head
+    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
+    with torch.no_grad():
+        new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
+        new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
+        new_lm_head.weight[vocab_size:] = 0
+        new_lm_head.bias[vocab_size:] = 0
+    model.llm.model.lm_head = new_lm_head
+    new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
+    # embed_tokens
+    embed_tokens = model.llm.model.model.embed_tokens
+    with torch.no_grad():
+        new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
+        new_codec_embed.weight[vocab_size:] = 0
+    model.llm.model.set_input_embeddings(new_codec_embed)
+    model.llm.model.to(device)
+    model.llm.model.to(dtype)
+    tmp_vocab_size = model.llm.model.config.vocab_size
+    tmp_tie_embedding = model.llm.model.config.tie_word_embeddings
+    del model.llm.model.generation_config.eos_token_id
+    del model.llm.model.config.bos_token_id
+    del model.llm.model.config.eos_token_id
+    model.llm.model.config.vocab_size = pad_vocab_size
+    model.llm.model.config.tie_word_embeddings = False
+    model.llm.model.config.use_bias = True
+    model.llm.model.save_pretrained(model_path)
+    os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
+    model.llm.model.config.vocab_size = tmp_vocab_size
+    model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
+    model.llm.model.set_input_embeddings(embed_tokens)
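`export_cosyvoice2_vllm` rounds the speech-token vocabulary up to a multiple of 64 (`DEFAULT_VOCAB_PADDING_SIZE`) before saving the Qwen2-style checkpoint and rewriting the architecture name in `config.json`. The round-up arithmetic, shown with illustrative numbers:

```python
# ((vocab_size + pad_to - 1) // pad_to) * pad_to rounds up to the next multiple of pad_to.
pad_to = 64
for vocab_size in (6561, 6564, 6592):             # illustrative values only
    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
    print(vocab_size, "->", pad_vocab_size)       # 6561 -> 6592, 6564 -> 6592, 6592 -> 6592
```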
xinference/thirdparty/cosyvoice/utils/losses.py

@@ -1,5 +1,6 @@
 import torch
 import torch.nn.functional as F
+from typing import Tuple
 
 
 def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
@@ -18,3 +19,39 @@ def mel_loss(real_speech, generated_speech, mel_transforms):
         mel_g = transform(generated_speech)
         loss += F.l1_loss(mel_g, mel_r)
     return loss
+
+
+class DPOLoss(torch.nn.Module):
+    """
+    DPO Loss
+    """
+
+    def __init__(self, beta: float, label_smoothing: float = 0.0, ipo: bool = False) -> None:
+        super().__init__()
+        self.beta = beta
+        self.label_smoothing = label_smoothing
+        self.ipo = ipo
+
+    def forward(
+        self,
+        policy_chosen_logps: torch.Tensor,
+        policy_rejected_logps: torch.Tensor,
+        reference_chosen_logps: torch.Tensor,
+        reference_rejected_logps: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pi_logratios = policy_chosen_logps - policy_rejected_logps
+        ref_logratios = reference_chosen_logps - reference_rejected_logps
+        logits = pi_logratios - ref_logratios
+        if self.ipo:
+            losses = (logits - 1 / (2 * self.beta)) ** 2  # Eq. 17 of https://arxiv.org/pdf/2310.12036v2.pdf
+        else:
+            # Eq. 3 https://ericmitchell.ai/cdpo.pdf; label_smoothing=0 gives original DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        loss = losses.mean()
+        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
+        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
+
+        return loss, chosen_rewards, rejected_rewards
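`DPOLoss.forward` takes per-sample log-probabilities of the chosen and rejected sequences under the policy and under a frozen reference model, and returns the loss plus detached reward terms. A small example with toy values; the import path assumes the vendored package layout (inside this wheel the module lives at `xinference/thirdparty/cosyvoice/utils/losses.py`).

```python
import torch
from cosyvoice.utils.losses import DPOLoss  # vendored copy: xinference.thirdparty.cosyvoice.utils.losses

dpo = DPOLoss(beta=0.1, label_smoothing=0.0, ipo=False)

policy_chosen = torch.tensor([-12.3, -10.1])      # log p(chosen) under the policy
policy_rejected = torch.tensor([-14.0, -11.5])    # log p(rejected) under the policy
ref_chosen = torch.tensor([-12.8, -10.4])         # same sequences under the frozen reference
ref_rejected = torch.tensor([-13.5, -11.2])

loss, chosen_rewards, rejected_rewards = dpo(
    policy_chosen, policy_rejected, ref_chosen, ref_rejected
)
print(loss.item(), chosen_rewards.tolist(), rejected_rewards.tolist())
```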
xinference/thirdparty/cosyvoice/utils/mask.py

@@ -86,7 +86,7 @@ def subsequent_mask(
     return mask
 
 
-def
+def subsequent_chunk_mask_deprecated(
         size: int,
         chunk_size: int,
         num_left_chunks: int = -1,
@@ -124,6 +124,40 @@ def subsequent_chunk_mask(
     return ret
 
 
+def subsequent_chunk_mask(
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+    Returns:
+        torch.Tensor: mask
+
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
+    pos_idx = torch.arange(size, device=device)
+    block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
+    ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
+    return ret
+
+
 def add_optional_chunk_mask(xs: torch.Tensor,
                             masks: torch.Tensor,
                             use_dynamic_chunk: bool,
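The rewritten `subsequent_chunk_mask` keeps the previous implementation under the name `subsequent_chunk_mask_deprecated` and replaces it with an `arange`-based formulation that, per the in-code note, meets ONNX export requirements but drops `num_left_chunks` support. A self-contained check of the docstring example, with the three core lines inlined so it only needs torch:

```python
import torch

size, chunk_size = 4, 2
pos_idx = torch.arange(size)
block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
mask = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
print(mask.int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])
```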
xinference/thirdparty/cosyvoice/utils/train_utils.py

@@ -50,10 +50,10 @@ def init_distributed(args):
    return world_size, local_rank, rank
 
 
-def init_dataset_and_dataloader(args, configs, gan):
+def init_dataset_and_dataloader(args, configs, gan, dpo):
     data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline']
-    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=True, partition=True)
-    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=False, partition=False)
+    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True)
+    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False)
 
     # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
     train_data_loader = DataLoader(train_dataset,
@@ -71,7 +71,7 @@ def init_dataset_and_dataloader(args, configs, gan):
 
 def check_modify_and_save_config(args, configs):
     if args.train_engine == "torch_ddp":
-        configs['train_conf']["dtype"] = 'fp32'
+        configs['train_conf']["dtype"] = 'bf16' if args.use_amp is True else 'fp32'
     else:
         with open(args.deepspeed_config, 'r') as fin:
             ds_configs = json.load(fin)
@@ -235,7 +235,7 @@ def cosyvoice_join(group_join, info_dict):
         return False
 
 
-def batch_forward(model, batch, scaler, info_dict):
+def batch_forward(model, batch, scaler, info_dict, ref_model=None, dpo_loss=None):
     device = int(os.environ.get('LOCAL_RANK', 0))
 
     dtype = info_dict["dtype"]
@@ -247,12 +247,30 @@ def batch_forward(model, batch, scaler, info_dict):
         dtype = torch.float32
 
     if info_dict['train_engine'] == 'torch_ddp':
-        autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
+        autocast = torch.cuda.amp.autocast(enabled=scaler is not None, dtype=dtype)
     else:
         autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
 
     with autocast:
         info_dict['loss_dict'] = model(batch, device)
+        if ref_model is not None and dpo_loss is not None:
+            chosen_logps = info_dict['loss_dict']["chosen_logps"]
+            rejected_logps = info_dict['loss_dict']["rejected_logps"]
+            sft_loss = info_dict['loss_dict']['loss']
+            with torch.no_grad():
+                ref_loss_dict = ref_model(batch, device)
+                reference_chosen_logps = ref_loss_dict["chosen_logps"]
+                reference_rejected_logps = ref_loss_dict["rejected_logps"]
+            preference_loss, chosen_reward, reject_reward = dpo_loss(
+                chosen_logps, rejected_logps, reference_chosen_logps, reference_rejected_logps
+            )
+            dpo_acc = (chosen_reward > reject_reward).float().mean()
+            info_dict['loss_dict']["loss"] = preference_loss + sft_loss
+            info_dict['loss_dict']["sft_loss"] = sft_loss
+            info_dict['loss_dict']["dpo_loss"] = preference_loss
+            info_dict['loss_dict']["dpo_acc"] = dpo_acc
+            info_dict['loss_dict']["chosen_reward"] = chosen_reward.mean()
+            info_dict['loss_dict']["reject_reward"] = reject_reward.mean()
     return info_dict
 
 
xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py (new file)

@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
+from vllm.model_executor.models.qwen2 import *
+
+
+class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.lora_config = lora_config
+
+        self.quant_config = quant_config
+        self.model = Qwen2Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              True,
+                                              quant_config=quant_config,
+                                              prefix=maybe_prefix(
+                                                  prefix, "lm_head"))
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata, self.lm_head.bias)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)