xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py

@@ -56,16 +56,11 @@ class Upsample1D(nn.Module):
         # In this mode, first repeat interpolate, than conv with stride=1
         self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
 
-    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor, conv_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
-        if conv_cache.size(2) == 0:
-            outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
-        else:
-            assert conv_cache.size(2) == self.stride * 2
-            outputs = torch.concat([conv_cache, outputs], dim=2)
-        conv_cache_new = outputs[:, :, -self.stride * 2:]
+        outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
         outputs = self.conv(outputs)
-        return outputs, input_lengths * self.stride, conv_cache_new
+        return outputs, input_lengths * self.stride
 
 
 class PreLookaheadLayer(nn.Module):
@@ -83,7 +78,7 @@ class PreLookaheadLayer(nn.Module):
             kernel_size=3, stride=1, padding=0,
         )
 
-    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0), conv2_cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
         """
         inputs: (batch_size, seq_len, channels)
         """
@@ -93,22 +88,18 @@
         if context.size(2) == 0:
             outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
         else:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
             assert context.size(2) == self.pre_lookahead_len
             outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
         outputs = F.leaky_relu(self.conv1(outputs))
         # outputs
-        if conv2_cache.size(2) == 0:
-            outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
-        else:
-            assert conv2_cache.size(2) == self.conv2.kernel_size[0] - 1
-            outputs = torch.concat([conv2_cache, outputs], dim=2)
-        conv2_cache_new = outputs[:, :, -(self.conv2.kernel_size[0] - 1):]
+        outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
         outputs = self.conv2(outputs)
         outputs = outputs.transpose(1, 2).contiguous()
 
         # residual connection
         outputs = outputs + inputs
-        return outputs, conv2_cache_new
+        return outputs
 
 
 class UpsampleConformerEncoder(torch.nn.Module):
@@ -253,6 +244,7 @@ class UpsampleConformerEncoder(torch.nn.Module):
         self,
         xs: torch.Tensor,
         xs_lens: torch.Tensor,
+        context: torch.Tensor = torch.zeros(0, 0, 0),
         decoding_chunk_size: int = 0,
         num_decoding_left_chunks: int = -1,
         streaming: bool = False,
@@ -285,15 +277,19 @@
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         xs, pos_emb, masks = self.embed(xs, masks)
+        if context.size(1) != 0:
+            assert self.training is False, 'you have passed context, make sure that you are running inference mode'
+            context_masks = torch.ones(1, 1, context.size(1)).to(masks)
+            context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
         mask_pad = masks  # (B, 1, T/subsample_rate)
         chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
         # lookahead + conformer encoder
-        xs, _ = self.pre_lookahead_layer(xs)
+        xs = self.pre_lookahead_layer(xs, context=context)
         xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
 
         # upsample + conformer encoder
         xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens, _ = self.up_layer(xs, xs_lens)
+        xs, xs_lens = self.up_layer(xs, xs_lens)
         xs = xs.transpose(1, 2).contiguous()
         T = xs.size(1)
         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
@@ -322,100 +318,3 @@ class UpsampleConformerEncoder(torch.nn.Module):
         for layer in self.up_encoders:
             xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
         return xs
-
-    @torch.jit.export
-    def forward_chunk(
-        self,
-        xs: torch.Tensor,
-        xs_lens: torch.Tensor,
-        offset: int = 0,
-        context: torch.Tensor = torch.zeros(0, 0, 0),
-        pre_lookahead_layer_conv2_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        encoders_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0),
-        upsample_offset: int = 0,
-        upsample_conv_cache: torch.Tensor = torch.zeros(0, 0, 0),
-        upsample_kv_cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0)
-    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[int, torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]]:
-        """Embed positions in tensor.
-
-        Args:
-            xs: padded input tensor (B, T, D)
-            xs_lens: input length (B)
-            decoding_chunk_size: decoding chunk size for dynamic chunk
-                0: default for training, use random dynamic chunk.
-                <0: for decoding, use full chunk.
-                >0: for decoding, use fixed chunk size as set.
-            num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
-                >=0: use num_decoding_left_chunks
-                <0: use all left chunks
-        Returns:
-            encoder output tensor xs, and subsampled masks
-            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
-            masks: torch.Tensor batch padding mask after subsample
-                (B, 1, T' ~= T/subsample_rate)
-        NOTE(xcsong):
-            We pass the `__call__` method of the modules instead of `forward` to the
-            checkpointing API because `__call__` attaches all the hooks of the module.
-            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-        """
-        assert xs.size(0) == 1
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if self.global_cmvn is not None:
-            xs = self.global_cmvn(xs)
-        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
-        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
-        offset += xs.size(1)
-        tmp_masks = torch.ones(1,
-                               context.size(1),
-                               device=context.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        if context.size(1) != 0:
-            context, _, _ = self.embed(context, tmp_masks, offset)
-
-        # lookahead + conformer encoder
-        xs, pre_lookahead_layer_conv2_cache = self.pre_lookahead_layer(xs, context, pre_lookahead_layer_conv2_cache)
-        # NOTE in cache mode we do not need to call add_optional_chunk_mask
-        chunk_masks = torch.ones((1, xs.size(1), offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        encoders_kv_cache_list = []
-        for index, layer in enumerate(self.encoders):
-            xs, chunk_masks, encoders_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, encoders_kv_cache[index])
-            encoders_kv_cache_list.append(encoders_kv_cache_new)
-        encoders_kv_cache = torch.stack(encoders_kv_cache_list, dim=0)
-
-        # upsample
-        xs = xs.transpose(1, 2).contiguous()
-        xs, xs_lens, upsample_conv_cache = self.up_layer(xs, xs_lens, upsample_conv_cache)
-        xs = xs.transpose(1, 2).contiguous()
-
-        # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(1,
-                               xs.size(1),
-                               device=xs.device,
-                               dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
-        xs, pos_emb, masks = self.up_embed(xs, tmp_masks, upsample_offset)
-        upsample_offset += xs.size(1)
-
-        # conformer encoder
-        chunk_masks = torch.ones((1, xs.size(1), upsample_offset), dtype=torch.bool, device=xs.device)
-        mask_pad = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
-        upsample_kv_cache_list = []
-        for index, layer in enumerate(self.up_encoders):
-            xs, chunk_masks, upsample_kv_cache_new, _ = layer(xs, chunk_masks, pos_emb, mask_pad, upsample_kv_cache[index])
-            upsample_kv_cache_list.append(upsample_kv_cache_new)
-        upsample_kv_cache = torch.stack(upsample_kv_cache_list, dim=0)
-
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-        # Here we assume the mask is not changed in encoder layers, so just
-        # return the masks before encoder layers, and the masks will be used
-        # for cross attention with decoder later
-        return xs, masks, (offset, pre_lookahead_layer_conv2_cache, encoders_kv_cache, upsample_offset, upsample_conv_cache, upsample_kv_cache)
xinference/thirdparty/cosyvoice/utils/common.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
 #               2024 Alibaba Inc (authors: Xiang Lyu)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +16,7 @@
 # Modified from ESPnet(https://github.com/espnet/espnet)
 """Unility functions for Transformer."""
 
+import queue
 import random
 from typing import List
 
@@ -164,3 +166,21 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
     # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
     mask = (1.0 - mask) * -1.0e+10
     return mask
+
+
+class TrtContextWrapper:
+    def __init__(self, trt_engine, trt_concurrent=1, device='cuda:0'):
+        self.trt_context_pool = queue.Queue(maxsize=trt_concurrent)
+        self.trt_engine = trt_engine
+        for _ in range(trt_concurrent):
+            trt_context = trt_engine.create_execution_context()
+            trt_stream = torch.cuda.stream(torch.cuda.Stream(device))
+            assert trt_context is not None, 'failed to create trt context, maybe not enough CUDA memory, try reduce current trt concurrent {}'.format(trt_concurrent)
+            self.trt_context_pool.put([trt_context, trt_stream])
+        assert self.trt_context_pool.empty() is False, 'no avaialbe estimator context'
+
+    def acquire_estimator(self):
+        return self.trt_context_pool.get(), self.trt_engine
+
+    def release_estimator(self, context, stream):
+        self.trt_context_pool.put([context, stream])
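
A minimal usage sketch of the new TrtContextWrapper (not part of the diff; `engine` and the inference call inside the stream are hypothetical placeholders): a context/stream pair is borrowed from the pool for one request and returned afterwards.

# Hedged sketch: `engine` is assumed to be an already deserialized
# tensorrt.ICudaEngine; buffer binding and the actual execute call depend on
# how the caller drives TensorRT and are only indicated by comments here.
wrapper = TrtContextWrapper(engine, trt_concurrent=2, device='cuda:0')

[trt_context, trt_stream], trt_engine = wrapper.acquire_estimator()
try:
    with trt_stream:
        # bind input/output buffers on trt_context and run the engine,
        # e.g. via trt_context.execute_v2(...) or execute_async_v3(...)
        pass
finally:
    # hand the context/stream pair back so other concurrent requests can use it
    wrapper.release_estimator(trt_context, trt_stream)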
xinference/thirdparty/cosyvoice/utils/executor.py

@@ -25,14 +25,16 @@ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, l
 
 class Executor:
 
-    def __init__(self, gan: bool = False):
+    def __init__(self, gan: bool = False, ref_model: torch.nn.Module = None, dpo_loss: torch.nn.Module = None):
         self.gan = gan
+        self.ref_model = ref_model
+        self.dpo_loss = dpo_loss
         self.step = 0
         self.epoch = 0
         self.rank = int(os.environ.get('RANK', 0))
         self.device = torch.device('cuda:{}'.format(self.rank))
 
-    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join):
+    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=None):
         ''' Train one epoch
         '''
 
@@ -44,6 +46,8 @@ class Executor:
         # torch.nn.parallel.DistributedDataParallel to be able to train
         # with uneven inputs across participating processes.
         model.train()
+        if self.ref_model is not None:
+            self.ref_model.eval()
         model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
         with model_context():
             for batch_idx, batch_dict in enumerate(train_data_loader):
@@ -65,7 +69,7 @@
                     context = nullcontext
 
                 with context():
-                    info_dict = batch_forward(model, batch_dict, scaler, info_dict)
+                    info_dict = batch_forward(model, batch_dict, scaler, info_dict, ref_model=self.ref_model, dpo_loss=self.dpo_loss)
                     info_dict = batch_backward(model, scaler, info_dict)
 
                 info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
@@ -162,7 +166,7 @@ class Executor:
                 for k, v in info_dict['loss_dict'].items():
                     if k not in total_loss_dict:
                         total_loss_dict[k] = []
-                    total_loss_dict[k].append(v.item() * num_utts)
+                    total_loss_dict[k].append(v.mean().item() * num_utts)
                 log_per_step(None, info_dict)
             for k, v in total_loss_dict.items():
                 total_loss_dict[k] = sum(v) / total_num_utts
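
How the new ref_model / dpo_loss arguments are meant to fit together (a hedged sketch, not code from the wheel: `model`, the frozen-copy step, and the beta value are illustrative assumptions):

import copy

from cosyvoice.utils.losses import DPOLoss  # added in this release

# The reference model is typically a frozen copy of the policy being trained.
ref_model = copy.deepcopy(model)
for p in ref_model.parameters():
    p.requires_grad_(False)

executor = Executor(gan=False, ref_model=ref_model, dpo_loss=DPOLoss(beta=0.01))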
xinference/thirdparty/cosyvoice/utils/file_utils.py

@@ -1,5 +1,6 @@
 # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
 #               2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import json
+import torch
 import torchaudio
 import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -56,7 +59,7 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     network = builder.create_network(network_flags)
     parser = trt.OnnxParser(network, logger)
     config = builder.create_builder_config()
-    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 33)  # 8GB
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)  # 4GB
     if fp16:
         config.set_flag(trt.BuilderFlag.FP16)
     profile = builder.create_optimization_profile()
@@ -83,3 +86,44 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     with open(trt_model, "wb") as f:
         f.write(engine_bytes)
     logging.info("Succesfully convert onnx to trt...")
+
+
+def export_cosyvoice2_vllm(model, model_path, device):
+    if os.path.exists(model_path):
+        return
+    pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
+    vocab_size = model.speech_embedding.num_embeddings
+    feature_size = model.speech_embedding.embedding_dim
+    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+    dtype = torch.bfloat16
+    # lm_head
+    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
+    with torch.no_grad():
+        new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
+        new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
+        new_lm_head.weight[vocab_size:] = 0
+        new_lm_head.bias[vocab_size:] = 0
+    model.llm.model.lm_head = new_lm_head
+    new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
+    # embed_tokens
+    embed_tokens = model.llm.model.model.embed_tokens
+    with torch.no_grad():
+        new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
+        new_codec_embed.weight[vocab_size:] = 0
+    model.llm.model.set_input_embeddings(new_codec_embed)
+    model.llm.model.to(device)
+    model.llm.model.to(dtype)
+    tmp_vocab_size = model.llm.model.config.vocab_size
+    tmp_tie_embedding = model.llm.model.config.tie_word_embeddings
+    del model.llm.model.generation_config.eos_token_id
+    del model.llm.model.config.bos_token_id
+    del model.llm.model.config.eos_token_id
+    model.llm.model.config.vocab_size = pad_vocab_size
+    model.llm.model.config.tie_word_embeddings = False
+    model.llm.model.config.use_bias = True
+    model.llm.model.save_pretrained(model_path)
+    os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
+    model.llm.model.config.vocab_size = tmp_vocab_size
+    model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
+    model.llm.model.set_input_embeddings(embed_tokens)
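
The exported head and embedding are padded so the speech-token vocabulary becomes a multiple of the default vocab padding (64); the rounding is plain integer arithmetic, checked below with a made-up vocab size for illustration:

# Hedged example of the round-up used in export_cosyvoice2_vllm above.
pad_to = 64
vocab_size = 6561          # hypothetical speech-token vocab size
pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
assert pad_vocab_size == 6592   # smallest multiple of 64 >= vocab_size
# rows [vocab_size:pad_vocab_size) of the new lm_head / embedding stay zero-filled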
xinference/thirdparty/cosyvoice/utils/losses.py

@@ -1,5 +1,6 @@
 import torch
 import torch.nn.functional as F
+from typing import Tuple
 
 
 def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
@@ -18,3 +19,39 @@ def mel_loss(real_speech, generated_speech, mel_transforms):
         mel_g = transform(generated_speech)
         loss += F.l1_loss(mel_g, mel_r)
     return loss
+
+
+class DPOLoss(torch.nn.Module):
+    """
+    DPO Loss
+    """
+
+    def __init__(self, beta: float, label_smoothing: float = 0.0, ipo: bool = False) -> None:
+        super().__init__()
+        self.beta = beta
+        self.label_smoothing = label_smoothing
+        self.ipo = ipo
+
+    def forward(
+        self,
+        policy_chosen_logps: torch.Tensor,
+        policy_rejected_logps: torch.Tensor,
+        reference_chosen_logps: torch.Tensor,
+        reference_rejected_logps: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pi_logratios = policy_chosen_logps - policy_rejected_logps
+        ref_logratios = reference_chosen_logps - reference_rejected_logps
+        logits = pi_logratios - ref_logratios
+        if self.ipo:
+            losses = (logits - 1 / (2 * self.beta)) ** 2  # Eq. 17 of https://arxiv.org/pdf/2310.12036v2.pdf
+        else:
+            # Eq. 3 https://ericmitchell.ai/cdpo.pdf; label_smoothing=0 gives original DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        loss = losses.mean()
+        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
+        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
+
+        return loss, chosen_rewards, rejected_rewards
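
A small self-contained sanity check of DPOLoss (not from the package; the log-probabilities are random placeholders standing in for per-pair sequence log-probs):

import torch

# Assumes the DPOLoss class from the hunk above is in scope.
dpo = DPOLoss(beta=0.1, label_smoothing=0.0, ipo=False)

B = 4  # a batch of preference pairs
policy_chosen = torch.randn(B)
policy_rejected = torch.randn(B)
reference_chosen = torch.randn(B)
reference_rejected = torch.randn(B)

loss, chosen_rewards, rejected_rewards = dpo(
    policy_chosen, policy_rejected, reference_chosen, reference_rejected
)
# loss is a scalar; the per-pair rewards feed metrics such as dpo_acc in batch_forward.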
xinference/thirdparty/cosyvoice/utils/mask.py

@@ -86,7 +86,7 @@ def subsequent_mask(
     return mask
 
 
-def subsequent_chunk_mask(
+def subsequent_chunk_mask_deprecated(
         size: int,
         chunk_size: int,
         num_left_chunks: int = -1,
@@ -124,6 +124,40 @@
     return ret
 
 
+def subsequent_chunk_mask(
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+    Returns:
+        torch.Tensor: mask
+
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
+    pos_idx = torch.arange(size, device=device)
+    block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
+    ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
+    return ret
+
+
 def add_optional_chunk_mask(xs: torch.Tensor,
                             masks: torch.Tensor,
                             use_dynamic_chunk: bool,
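
What the reworked, ONNX-friendly subsequent_chunk_mask computes can be checked in isolation (a standalone sketch reproducing the docstring example, not code from the diff):

import torch

size, chunk_size = 4, 2
pos_idx = torch.arange(size)
# each position may attend up to the end of its own chunk: block_value = [2, 2, 4, 4]
block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
mask = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
print(mask.int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])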
xinference/thirdparty/cosyvoice/utils/train_utils.py

@@ -50,10 +50,10 @@ def init_distributed(args):
     return world_size, local_rank, rank
 
 
-def init_dataset_and_dataloader(args, configs, gan):
+def init_dataset_and_dataloader(args, configs, gan, dpo):
     data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline']
-    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=True, partition=True)
-    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=False, partition=False)
+    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True)
+    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False)
 
     # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
     train_data_loader = DataLoader(train_dataset,
@@ -71,7 +71,7 @@ def init_dataset_and_dataloader(args, configs, gan):
 
 def check_modify_and_save_config(args, configs):
     if args.train_engine == "torch_ddp":
-        configs['train_conf']["dtype"] = 'fp32'
+        configs['train_conf']["dtype"] = 'bf16' if args.use_amp is True else 'fp32'
     else:
         with open(args.deepspeed_config, 'r') as fin:
             ds_configs = json.load(fin)
@@ -235,7 +235,7 @@ def cosyvoice_join(group_join, info_dict):
         return False
 
 
-def batch_forward(model, batch, scaler, info_dict):
+def batch_forward(model, batch, scaler, info_dict, ref_model=None, dpo_loss=None):
     device = int(os.environ.get('LOCAL_RANK', 0))
 
     dtype = info_dict["dtype"]
@@ -247,12 +247,30 @@ def batch_forward(model, batch, scaler, info_dict):
         dtype = torch.float32
 
     if info_dict['train_engine'] == 'torch_ddp':
-        autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
+        autocast = torch.cuda.amp.autocast(enabled=scaler is not None, dtype=dtype)
     else:
         autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
 
     with autocast:
         info_dict['loss_dict'] = model(batch, device)
+        if ref_model is not None and dpo_loss is not None:
+            chosen_logps = info_dict['loss_dict']["chosen_logps"]
+            rejected_logps = info_dict['loss_dict']["rejected_logps"]
+            sft_loss = info_dict['loss_dict']['loss']
+            with torch.no_grad():
+                ref_loss_dict = ref_model(batch, device)
+                reference_chosen_logps = ref_loss_dict["chosen_logps"]
+                reference_rejected_logps = ref_loss_dict["rejected_logps"]
+            preference_loss, chosen_reward, reject_reward = dpo_loss(
+                chosen_logps, rejected_logps, reference_chosen_logps, reference_rejected_logps
+            )
+            dpo_acc = (chosen_reward > reject_reward).float().mean()
+            info_dict['loss_dict']["loss"] = preference_loss + sft_loss
+            info_dict['loss_dict']["sft_loss"] = sft_loss
+            info_dict['loss_dict']["dpo_loss"] = preference_loss
+            info_dict['loss_dict']["dpo_acc"] = dpo_acc
+            info_dict['loss_dict']["chosen_reward"] = chosen_reward.mean()
+            info_dict['loss_dict']["reject_reward"] = reject_reward.mean()
     return info_dict
 
 
xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py (new file)

@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
+from vllm.model_executor.models.qwen2 import *
+
+
+class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.lora_config = lora_config
+
+        self.quant_config = quant_config
+        self.model = Qwen2Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              True,
+                                              quant_config=quant_config,
+                                              prefix=maybe_prefix(
+                                                  prefix, "lm_head"))
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata, self.lm_head.bias)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
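
For orientation only (not part of the wheel): a custom architecture like this is normally made visible to vLLM through its model registry once the exported config.json names CosyVoice2ForCausalLM. A hedged sketch of that registration, assuming the module path used in this package:

from vllm import ModelRegistry

from xinference.thirdparty.cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM

# export_cosyvoice2_vllm() rewrites the exported config.json to use the
# "CosyVoice2ForCausalLM" architecture name; vLLM then needs a class
# registered under that name before loading the model.
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)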