crfm_helm-0.5.7-py3-none-any.whl → crfm_helm-0.5.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic.
Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py
@@ -0,0 +1,4308 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_qwen2_5_omni.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+ import math
24
+ import operator
25
+ from dataclasses import dataclass
26
+ from itertools import accumulate
27
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
28
+
29
+ import numpy as np
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+ from torch.nn import ConvTranspose1d, Parameter
34
+ from helm.common.optional_dependencies import handle_module_not_found_error
35
+
36
+ from transformers.activations import ACT2FN
37
+ from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache, SlidingWindowCache, StaticCache
38
+ from transformers.generation import GenerationMixin
39
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
40
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, ModelOutput
41
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
42
+
43
+ try:
44
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
45
+ except ModuleNotFoundError as e:
46
+ handle_module_not_found_error(e, ["audiolm"])
47
+ from transformers.utils import (
48
+ add_start_docstrings,
49
+ is_flash_attn_2_available,
50
+ is_flash_attn_greater_or_equal_2_10,
51
+ logging,
52
+ )
53
+ from transformers.utils.hub import cached_file
54
+ from helm.clients.audio_language.qwen_omni.configuration_qwen2_5_omni import (
55
+ Qwen2_5OmniAudioEncoderConfig,
56
+ Qwen2_5OmniBigVGANConfig,
57
+ Qwen2_5OmniConfig,
58
+ Qwen2_5OmniDiTConfig,
59
+ Qwen2_5OmniTalkerConfig,
60
+ Qwen2_5OmniTextConfig,
61
+ Qwen2_5OmniThinkerConfig,
62
+ Qwen2_5OmniToken2WavConfig,
63
+ Qwen2_5OmniVisionEncoderConfig,
64
+ )
65
+
66
+
67
+ if is_flash_attn_2_available():
68
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func
69
+ from flash_attn.layers.rotary import apply_rotary_emb
70
+ else:
71
+ flash_attn_varlen_func = None
72
+ apply_rotary_emb = None
73
+
74
+
75
+ if is_flash_attn_2_available():
76
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
77
+ else:
78
+ flash_attn_varlen_func = None
79
+
80
+
81
+ logger = logging.get_logger(__name__)
82
+
83
+
84
+ # @add_start_docstrings(
85
+ # "The bare Qwen2.5Omni Model outputting raw hidden-states without any specific head on top.",
86
+ # QWEN2_5OMNI_START_DOCSTRING.format(config_class="Qwen2_5OmniConfig"),
87
+ # )
88
+ class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
89
+ config_class: Any = Qwen2_5OmniConfig
90
+ base_model_prefix = "model"
91
+ supports_gradient_checkpointing = True
92
+ _skip_keys_device_placement = "past_key_values"
93
+ _supports_flash_attn_2 = True
94
+ _supports_sdpa = True
95
+ _supports_cache_class = True
96
+ _supports_static_cache = True
97
+
98
+ def _init_weights(self, module):
99
+ # important: this ported version of Qwen2.5OmniThinker isn't meant for training from scratch - only
100
+ # inference and fine-tuning - so the proper init weights code has been removed
101
+ std = self.config.init_std if hasattr(self.config, "init_std") else 0.02
102
+
103
+ if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv3d)):
104
+ module.weight.data.normal_(mean=0.0, std=std)
105
+ if module.bias is not None:
106
+ module.bias.data.zero_()
107
+ elif isinstance(module, nn.Embedding):
108
+ module.weight.data.normal_(mean=0.0, std=std)
109
+ if module.padding_idx is not None:
110
+ module.weight.data[module.padding_idx].zero_()
111
+
112
+
113
+ class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
114
+ def _prepare_4d_causal_attention_mask_with_cache_position(
115
+ self,
116
+ attention_mask: torch.Tensor,
117
+ sequence_length: int,
118
+ target_length: int,
119
+ dtype: torch.dtype,
120
+ device: torch.device,
121
+ min_dtype: float,
122
+ cache_position: torch.Tensor,
123
+ batch_size: int,
124
+ ):
125
+ if attention_mask is not None and attention_mask.dim() == 4:
126
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
127
+ causal_mask = attention_mask
128
+ else:
129
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
130
+ if sequence_length != 1:
131
+ causal_mask = torch.triu(causal_mask, diagonal=1)
132
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
133
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
134
+ if attention_mask is not None:
135
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
136
+ mask_length = attention_mask.shape[-1]
137
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
138
+ padding_mask = padding_mask == 0
139
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
140
+ padding_mask, min_dtype
141
+ )
142
+
143
+ return causal_mask
144
+
145
+ def get_input_embeddings(self):
146
+ return self.model.get_input_embeddings()
147
+
148
+ def set_input_embeddings(self, value):
149
+ self.model.set_input_embeddings(value)
150
+
151
+ def get_llm_pos_ids_for_vision(
152
+ self,
153
+ start_idx: int,
154
+ vision_idx: int,
155
+ spatial_merge_size: int,
156
+ t_index: torch.Tensor,
157
+ grid_hs: torch.Tensor,
158
+ grid_ws: torch.Tensor,
159
+ ):
160
+ llm_pos_ids_list = []
161
+ llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
162
+ llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
163
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(len(t_index), -1, llm_grid_w).flatten()
164
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(len(t_index), llm_grid_h, -1).flatten()
165
+ t_index_p = torch.Tensor(t_index).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten().long()
166
+ _llm_pos_ids = torch.stack([t_index_p, h_index, w_index])
167
+ llm_pos_ids_list.append(_llm_pos_ids + start_idx) # + 1 ) # 12.09 by malinhan
168
+ llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1)
169
+ return llm_pos_ids
170
+
171
+ def get_chunked_index(self, llm_pos_ids, t_ntoken_per_chunk, st_idx):
172
+ def _iter():
173
+ i, start_idx = 0, 0 # skip bos token
174
+ current_chunk = 1
175
+ while i < llm_pos_ids.shape[1]: # skip eos token
176
+ if llm_pos_ids[0][i] - st_idx >= current_chunk * t_ntoken_per_chunk:
177
+ yield (start_idx, i)
178
+ start_idx = i
179
+ current_chunk += 1
180
+ i += 1
181
+ yield (start_idx, llm_pos_ids.shape[1])
182
+
183
+ return list(_iter())
184
+
185
+ def get_rope_index(
186
+ self,
187
+ input_ids: Optional[torch.LongTensor] = None,
188
+ image_grid_thw: Optional[torch.LongTensor] = None,
189
+ video_grid_thw: Optional[torch.LongTensor] = None,
190
+ attention_mask: Optional[torch.Tensor] = None,
191
+ use_audio_in_video: Optional[bool] = False,
192
+ audio_seqlens: Optional[torch.Tensor] = None,
193
+ second_per_grids: Optional[torch.Tensor] = None,
194
+ ):
195
+ spatial_merge_size = self.spatial_merge_size
196
+ image_token_id = self.config.image_token_index
197
+ video_token_id = self.config.video_token_index
198
+ audio_token_id = self.config.audio_token_index
199
+ vision_start_token_id = self.config.vision_start_token_id
200
+ audio_start_token_id = self.config.audio_start_token_id
201
+ position_id_per_seconds = self.config.position_id_per_seconds
202
+ seconds_per_chunk = self.config.seconds_per_chunk
203
+
204
+ mrope_position_deltas = []
205
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
206
+ total_input_ids = input_ids
207
+ if attention_mask is None:
208
+ attention_mask = torch.ones_like(total_input_ids)
209
+ position_ids = torch.ones(
210
+ 3,
211
+ input_ids.shape[0],
212
+ input_ids.shape[1],
213
+ dtype=input_ids.dtype,
214
+ device=input_ids.device,
215
+ )
216
+ image_idx, video_idx, audio_idx = 0, 0, 0
217
+ attention_mask = attention_mask.to(total_input_ids.device)
218
+ for i, input_ids_p in enumerate(total_input_ids):
219
+ input_ids_p = input_ids_p[attention_mask[i] == 1]
220
+ image_nums, video_nums, audio_nums = 0, 0, 0
221
+ vision_start_indices = torch.argwhere(input_ids_p == vision_start_token_id).squeeze(1)
222
+ vision_tokens = input_ids_p[vision_start_indices + 1]
223
+ audio_nums = int(torch.sum(input_ids_p == audio_start_token_id).item())
224
+ image_nums = (vision_tokens == image_token_id).sum()
225
+ video_nums = (
226
+ (vision_tokens == audio_start_token_id).sum()
227
+ if use_audio_in_video
228
+ else (vision_tokens == video_token_id).sum()
229
+ )
230
+ input_tokens = input_ids_p.tolist()
231
+ llm_pos_ids_list: list = []
232
+ st = 0
233
+ remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums
234
+ multimodal_nums = (
235
+ image_nums + audio_nums if use_audio_in_video else image_nums + video_nums + audio_nums
236
+ )
237
+ for _ in range(multimodal_nums):
238
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
239
+ if image_token_id in input_tokens and remain_images > 0:
240
+ ed_image = input_tokens.index(image_token_id, st)
241
+ else:
242
+ ed_image = len(input_tokens) + 1
243
+ if video_token_id in input_tokens and remain_videos > 0:
244
+ ed_video = input_tokens.index(video_token_id, st)
245
+ else:
246
+ ed_video = len(input_tokens) + 1
247
+ if audio_token_id in input_tokens and remain_audios > 0:
248
+ ed_audio = input_tokens.index(audio_token_id, st)
249
+ else:
250
+ ed_audio = len(input_tokens) + 1
251
+ min_ed = min(ed_image, ed_video, ed_audio)
252
+ if min_ed == ed_audio and audio_seqlens is not None:
253
+ text_len = min_ed - st - 1
254
+ if text_len != 0:
255
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
256
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
257
+
258
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
259
+ bos_len = 1
260
+ llm_pos_ids_list.append(torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx)
261
+
262
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
263
+ audio_len = ((audio_seqlens[audio_idx] - 1) // 2 + 1 - 2) // 2 + 1
264
+ llm_pos_ids = torch.arange(audio_len).view(1, -1).expand(3, -1) + st_idx
265
+ llm_pos_ids_list.append(llm_pos_ids)
266
+
267
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
268
+ eos_len = 1
269
+ llm_pos_ids_list.append(torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx)
270
+
271
+ st += text_len + bos_len + audio_len + eos_len
272
+ audio_idx += 1
273
+ remain_audios -= 1
274
+
275
+ elif min_ed == ed_image and image_grid_thw is not None:
276
+ text_len = min_ed - st - 1
277
+ if text_len != 0:
278
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
279
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
280
+
281
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
282
+ bos_len = 1
283
+ llm_pos_ids_list.append(torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx)
284
+
285
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
286
+ grid_t = image_grid_thw[image_idx][0]
287
+ grid_hs = image_grid_thw[:, 1]
288
+ grid_ws = image_grid_thw[:, 2]
289
+ t_index = (torch.arange(grid_t.item()) * 1 * position_id_per_seconds).long()
290
+ llm_pos_ids = self.get_llm_pos_ids_for_vision(
291
+ st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
292
+ )
293
+ image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
294
+ llm_pos_ids_list.append(llm_pos_ids)
295
+
296
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
297
+ eos_len = 1
298
+ llm_pos_ids_list.append(torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx)
299
+
300
+ st += text_len + bos_len + image_len + eos_len
301
+ image_idx += 1
302
+ remain_images -= 1
303
+
304
+ elif (
305
+ min_ed == ed_video
306
+ and not use_audio_in_video
307
+ and video_grid_thw is not None
308
+ and second_per_grids is not None
309
+ ):
310
+ text_len = min_ed - st - 1
311
+ if text_len != 0:
312
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
313
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
314
+
315
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
316
+ bos_len = 1
317
+ llm_pos_ids_list.append(torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx)
318
+
319
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
320
+ grid_t = video_grid_thw[video_idx][0]
321
+ grid_hs = video_grid_thw[:, 1]
322
+ grid_ws = video_grid_thw[:, 2]
323
+ t_index = (
324
+ torch.arange(grid_t.item())
325
+ * second_per_grids[video_idx].cpu().float()
326
+ * position_id_per_seconds
327
+ ).long()
328
+ llm_pos_ids = self.get_llm_pos_ids_for_vision(
329
+ st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
330
+ )
331
+ video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
332
+ llm_pos_ids_list.append(llm_pos_ids)
333
+
334
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
335
+ eos_len = 1
336
+ llm_pos_ids_list.append(torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx)
337
+
338
+ st += text_len + bos_len + video_len + eos_len
339
+ video_idx += 1
340
+ remain_videos -= 1
341
+
342
+ elif (
343
+ min_ed == ed_video
344
+ and use_audio_in_video
345
+ and audio_seqlens is not None
346
+ and video_grid_thw is not None
347
+ and second_per_grids is not None
348
+ ):
349
+ text_len = min_ed - st - 2
350
+ if text_len != 0:
351
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
352
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
353
+
354
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
355
+ bos_len = 1
356
+ llm_pos_ids_list.append(torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx)
357
+ llm_pos_ids_list.append(torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx)
358
+
359
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
360
+ audio_len = ((audio_seqlens[audio_idx] - 1) // 2 + 1 - 2) // 2 + 1
361
+ audio_llm_pos_ids = torch.arange(audio_len).view(1, -1).expand(3, -1) + st_idx
362
+ grid_t = video_grid_thw[video_idx][0]
363
+ grid_hs = video_grid_thw[:, 1]
364
+ grid_ws = video_grid_thw[:, 2]
365
+
366
+ t_index = (
367
+ torch.arange(grid_t.item())
368
+ * second_per_grids[video_idx].cpu().float()
369
+ * position_id_per_seconds
370
+ ).long()
371
+ video_llm_pos_ids = self.get_llm_pos_ids_for_vision(
372
+ st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
373
+ )
374
+
375
+ t_ntoken_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
376
+ video_chunk_indexes = self.get_chunked_index(video_llm_pos_ids, t_ntoken_per_chunk, st_idx)
377
+ audio_chunk_indexes = self.get_chunked_index(audio_llm_pos_ids, t_ntoken_per_chunk, st_idx)
378
+ sub_len = 0
379
+ for j in range(max(len(video_chunk_indexes), len(audio_chunk_indexes))):
380
+ video_chunk_index = video_chunk_indexes[j] if j < len(video_chunk_indexes) else None
381
+ audio_chunk_index = audio_chunk_indexes[j] if j < len(audio_chunk_indexes) else None
382
+ if video_chunk_index is not None:
383
+ sub_len += video_chunk_index[1] - video_chunk_index[0]
384
+
385
+ llm_pos_ids_list.append(
386
+ video_llm_pos_ids[:, video_chunk_index[0] : video_chunk_index[1]]
387
+ )
388
+ if audio_chunk_index is not None:
389
+ sub_len += audio_chunk_index[1] - audio_chunk_index[0]
390
+
391
+ llm_pos_ids_list.append(
392
+ audio_llm_pos_ids[:, audio_chunk_index[0] : audio_chunk_index[1]]
393
+ )
394
+ video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
395
+
396
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
397
+ eos_len = 1
398
+ llm_pos_ids_list.append(torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx)
399
+ llm_pos_ids_list.append(torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx)
400
+
401
+ st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2
402
+
403
+ audio_idx += 1
404
+ video_idx += 1
405
+ remain_videos -= 1
406
+ remain_audios -= 1
407
+
408
+ if st < len(input_tokens):
409
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
410
+ text_len = len(input_tokens) - st
411
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
412
+
413
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
414
+
415
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
416
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(input_ids_p))
417
+ mrope_position_deltas_p = torch.tensor(mrope_position_deltas, device=input_ids_p.device).unsqueeze(1)
418
+
419
+ return position_ids, mrope_position_deltas_p
420
+ else:
421
+ if attention_mask is not None:
422
+ position_ids = attention_mask.long().cumsum(-1) - 1
423
+ position_ids.masked_fill_(attention_mask == 0, 1)
424
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
425
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
426
+ mrope_position_deltas_p = max_position_ids + 1 - torch.sum(attention_mask, dim=-1, keepdim=True)
427
+
428
+ return position_ids, mrope_position_deltas_p
429
+
430
+
431
+ @dataclass
432
+ class Qwen2_5OmniThinkerCausalLMOutputWithPast(ModelOutput):
433
+
434
+ loss: Optional[torch.FloatTensor] = None
435
+ logits: Optional[torch.FloatTensor] = None
436
+ past_key_values: Optional[List[torch.FloatTensor]] = None
437
+ hidden_states: Optional[Any] = None
438
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
439
+ attention_mask: Optional[torch.Tensor] = None
440
+ rope_deltas: Optional[torch.Tensor] = None
441
+
442
+
443
+ class Qwen2_5OmniAudioAttention(nn.Module):
444
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
445
+
446
+ def __init__(
447
+ self,
448
+ embed_dim: int,
449
+ num_heads: int,
450
+ dropout: float = 0.0,
451
+ is_decoder: bool = False,
452
+ bias: bool = True,
453
+ is_causal: bool = False,
454
+ layer_idx: Optional[int] = None,
455
+ config: Optional[Qwen2_5OmniThinkerConfig] = None,
456
+ ):
457
+ super().__init__()
458
+ self.embed_dim = embed_dim
459
+ self.num_heads = num_heads
460
+ self.dropout = dropout
461
+ self.head_dim = embed_dim // num_heads
462
+ self.config = config
463
+
464
+ if (self.head_dim * num_heads) != self.embed_dim:
465
+ raise ValueError(
466
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
467
+ f" and `num_heads`: {num_heads})."
468
+ )
469
+ self.scaling = self.head_dim**-0.5
470
+ self.is_decoder = is_decoder
471
+ self.is_causal = is_causal
472
+
473
+ if layer_idx is None and is_decoder:
474
+ logger.warning_once(
475
+ f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
476
+ "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
477
+ "when creating this class."
478
+ )
479
+ self.layer_idx = layer_idx
480
+
481
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
482
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
483
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
484
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
485
+
486
+ def forward(
487
+ self,
488
+ hidden_states: torch.Tensor,
489
+ key_value_states: Optional[torch.Tensor] = None,
490
+ past_key_value: Optional[EncoderDecoderCache] = None,
491
+ cu_seqlens: Optional[torch.Tensor] = None,
492
+ layer_head_mask: Optional[torch.Tensor] = None,
493
+ output_attentions: bool = False,
494
+ cache_position: Optional[torch.LongTensor] = None,
495
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
496
+ """Input shape: Batch x Time x Channel"""
497
+
498
+ # if key_value_states are provided this layer is used as a cross-attention layer
499
+ # for the decoder
500
+ is_cross_attention = key_value_states is not None
501
+ seq_length, _ = hidden_states.size()
502
+
503
+ # get query proj
504
+ # query_states = self.q_proj(hidden_states)
505
+ query_states = (hidden_states @ self.q_proj.weight.t()) + self.q_proj.bias
506
+
507
+ query_states = query_states.reshape(seq_length, self.num_heads, -1)
508
+
509
+ if past_key_value is not None:
510
+ is_updated = past_key_value.is_updated.get(self.layer_idx)
511
+ if is_cross_attention:
512
+ # after the first generated id, we can subsequently re-use all key/value_states from cache
513
+ past_key_value.is_updated[self.layer_idx] = True
514
+ past_key_value = past_key_value.cross_attention_cache
515
+ else:
516
+ past_key_value = past_key_value.self_attention_cache
517
+
518
+ # use key_value_states if cross attention
519
+ current_states = key_value_states if key_value_states is not None else hidden_states
520
+ if is_cross_attention and past_key_value and is_updated:
521
+ # reuse k,v, cross_attentions
522
+ key_states = past_key_value.key_cache[self.layer_idx]
523
+ value_states = past_key_value.value_cache[self.layer_idx]
524
+ else:
525
+ key_states = self.k_proj(current_states).reshape(seq_length, self.num_heads, -1)
526
+ value_states = self.v_proj(current_states).reshape(seq_length, self.num_heads, -1)
527
+ if past_key_value is not None:
528
+ # save all key/value_states to cache to be re-used for fast auto-regressive generation
529
+ cache_position = cache_position if not is_cross_attention else None
530
+ key_states, value_states = past_key_value.update(
531
+ key_states, value_states, self.layer_idx, {"cache_position": cache_position}
532
+ )
533
+
534
+ query_states = query_states.transpose(0, 1)
535
+ key_states = key_states.transpose(0, 1)
536
+ value_states = value_states.transpose(0, 1)
537
+ attn_weights = torch.matmul(query_states, key_states.transpose(1, 2)) / math.sqrt(self.head_dim)
538
+
539
+ attention_mask = torch.full(
540
+ [1, seq_length, key_states.shape[1]],
541
+ torch.finfo(query_states.dtype).min,
542
+ device=query_states.device,
543
+ dtype=query_states.dtype,
544
+ )
545
+ assert cu_seqlens is not None
546
+ for i in range(1, cu_seqlens.size(0)):
547
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
548
+
549
+ attn_weights = attn_weights + attention_mask
550
+
551
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1).to(query_states.dtype)
552
+
553
+ if layer_head_mask is not None:
554
+ if layer_head_mask.size() != (self.num_heads,):
555
+ raise ValueError(
556
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
557
+ f" {layer_head_mask.size()}"
558
+ )
559
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights
560
+
561
+ attn_output = torch.matmul(attn_weights, value_states).transpose(0, 1).reshape(seq_length, self.embed_dim)
562
+
563
+ attn_output = self.out_proj(attn_output)
564
+
565
+ return attn_output, attn_weights, past_key_value
566
+
567
+
568
+ class Qwen2_5OmniAudioFlashAttention2(Qwen2_5OmniAudioAttention):
569
+
570
+ def __init__(self, *args, **kwargs):
571
+ super().__init__(*args, **kwargs)
572
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
573
+
574
+ def forward(
575
+ self,
576
+ hidden_states: torch.Tensor,
577
+ key_value_states: Optional[torch.Tensor] = None,
578
+ past_key_value: Optional[EncoderDecoderCache] = None,
579
+ cu_seqlens: Optional[torch.Tensor] = None,
580
+ layer_head_mask: Optional[torch.Tensor] = None,
581
+ output_attentions: bool = False,
582
+ cache_position: Optional[torch.LongTensor] = None,
583
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
584
+ if isinstance(past_key_value, StaticCache):
585
+ raise ValueError(
586
+ "The `static` cache implementation is not compatible with `attn_implementation='flash_attention_2'`. "
587
+ "Use `attn_implementation='sdpa'` in the meantime, and open an issue "
588
+ "at https://github.com/huggingface/transformers"
589
+ )
590
+ # Qwen2.5OmniThinkerFlashAttention2 attention does not support output_attentions
591
+ if output_attentions:
592
+ raise ValueError("Qwen2.5OmniThinkerFlashAttention2 attention does not support output_attentions")
593
+
594
+ # if key_value_states are provided this layer is used as a cross-attention layer
595
+ # for the decoder
596
+ is_cross_attention = key_value_states is not None
597
+ seq_length, all_dim = hidden_states.size()
598
+ query_states = (hidden_states @ self.q_proj.weight.t()) + (
599
+ self.q_proj.bias if self.q_proj.bias is not None else 0
600
+ )
601
+ query_states = query_states.reshape(seq_length, self.num_heads, -1)
602
+
603
+ if past_key_value is not None:
604
+ is_updated = past_key_value.is_updated.get(self.layer_idx)
605
+ if is_cross_attention:
606
+ # after the first generated id, we can subsequently re-use all key/value_states from cache
607
+ past_key_value.is_updated[self.layer_idx] = True
608
+ past_key_value = past_key_value.cross_attention_cache
609
+ else:
610
+ past_key_value = past_key_value.self_attention_cache
611
+
612
+ # use key_value_states if cross attention
613
+ current_states = key_value_states if key_value_states is not None else hidden_states
614
+ if is_cross_attention and past_key_value and is_updated:
615
+ # reuse k,v, cross_attentions
616
+ key_states = past_key_value.key_cache[self.layer_idx]
617
+ value_states = past_key_value.value_cache[self.layer_idx]
618
+ else:
619
+ key_states = (current_states @ self.k_proj.weight.t()) + (
620
+ self.k_proj.bias if self.k_proj.bias is not None else 0
621
+ )
622
+ key_states = key_states.reshape(seq_length, self.num_heads, -1)
623
+ value_states = (current_states @ self.v_proj.weight.t()) + (
624
+ self.v_proj.bias if self.v_proj.bias is not None else 0
625
+ )
626
+ value_states = value_states.reshape(seq_length, self.num_heads, -1)
627
+
628
+ if past_key_value is not None:
629
+ # save all key/value_states to cache to be re-used for fast auto-regressive generation
630
+ cache_position = cache_position if not is_cross_attention else None
631
+ key_states, value_states = past_key_value.update(
632
+ key_states, value_states, self.layer_idx, {"cache_position": cache_position}
633
+ )
634
+ assert cu_seqlens is not None
635
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
636
+ attn_output = flash_attn_varlen_func(
637
+ query_states, key_states, value_states, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, dropout_p=0.0
638
+ )
639
+ attn_output = attn_output.reshape(seq_length, all_dim)
640
+ attn_output = (attn_output @ self.out_proj.weight.t()) + (
641
+ self.out_proj.bias if self.out_proj.bias is not None else 0
642
+ )
643
+
644
+ if not output_attentions:
645
+ attn_weights = None
646
+
647
+ return attn_output, attn_weights, past_key_value
648
+
649
+
650
+ class Qwen2_5OmniAudioSdpaAttention(Qwen2_5OmniAudioAttention):
651
+ def forward(
652
+ self,
653
+ hidden_states: torch.Tensor,
654
+ key_value_states: Optional[torch.Tensor] = None,
655
+ past_key_value: Optional[EncoderDecoderCache] = None,
656
+ cu_seqlens: Optional[torch.Tensor] = None,
657
+ layer_head_mask: Optional[torch.Tensor] = None,
658
+ output_attentions: bool = False,
659
+ cache_position: Optional[torch.LongTensor] = None,
660
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
661
+ """Input shape: Batch x Time x Channel"""
662
+ if output_attentions or layer_head_mask is not None:
663
+ logger.warning_once(
664
+ "Qwen2_5OmniThinkerModel is using Qwen2_5OmniThinkerSdpaAttention, but "
665
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` "
666
+ "or `layer_head_mask` not None. Falling back to the manual attention"
667
+ ' implementation, but specifying the manual implementation will be required "'
668
+ '"from Transformers version v5.0.0 onwards. This warning can be removed using the argument"'
669
+ '" `attn_implementation="eager"` when loading the model.'
670
+ )
671
+ return super().forward(
672
+ hidden_states,
673
+ key_value_states=key_value_states,
674
+ past_key_value=past_key_value,
675
+ cu_seqlens=cu_seqlens,
676
+ layer_head_mask=layer_head_mask,
677
+ output_attentions=output_attentions,
678
+ cache_position=cache_position,
679
+ )
680
+
681
+ # if key_value_states are provided this layer is used as a cross-attention layer
682
+ # for the decoder
683
+ is_cross_attention = key_value_states is not None
684
+ seq_length, _ = hidden_states.size()
685
+
686
+ # get query proj
687
+ query_states = self.q_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
688
+
689
+ if past_key_value is not None:
690
+ is_updated = past_key_value.is_updated.get(self.layer_idx)
691
+ if is_cross_attention:
692
+ # after the first generated id, we can subsequently re-use all key/value_states from cache
693
+ past_key_value.is_updated[self.layer_idx] = True
694
+ past_key_value = past_key_value.cross_attention_cache
695
+ else:
696
+ past_key_value = past_key_value.self_attention_cache
697
+
698
+ # use key_value_states if cross attention
699
+ current_states = key_value_states if key_value_states is not None else hidden_states
700
+ if is_cross_attention and past_key_value and is_updated:
701
+ # reuse k,v, cross_attentions
702
+ key_states = past_key_value.key_cache[self.layer_idx]
703
+ value_states = past_key_value.value_cache[self.layer_idx]
704
+ else:
705
+ key_states = self.k_proj(current_states).reshape(seq_length, self.num_heads, -1)
706
+ value_states = self.v_proj(current_states).reshape(seq_length, self.num_heads, -1)
707
+ if past_key_value is not None:
708
+ # save all key/value_states to cache to be re-used for fast auto-regressive generation
709
+ cache_position = cache_position if not is_cross_attention else None
710
+ key_states, value_states = past_key_value.update(
711
+ key_states, value_states, self.layer_idx, {"cache_position": cache_position}
712
+ )
713
+
714
+ attention_mask = torch.zeros([1, seq_length, key_states.shape[0]], device=query_states.device, dtype=torch.bool)
715
+ assert cu_seqlens is not None
716
+ for i in range(1, cu_seqlens.size(0)):
717
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
718
+
719
+ query_states = query_states.transpose(0, 1)
720
+ key_states = key_states.transpose(0, 1)
721
+ value_states = value_states.transpose(0, 1)
722
+
723
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
724
+ query_states,
725
+ key_states,
726
+ value_states,
727
+ attn_mask=attention_mask,
728
+ dropout_p=self.dropout if self.training else 0.0,
729
+ )
730
+ attn_output = attn_output.transpose(0, 1)
731
+
732
+ attn_output = attn_output.reshape(seq_length, self.embed_dim)
733
+ attn_output = self.out_proj(attn_output)
734
+ return attn_output, None, past_key_value
735
+
736
+
737
+ QWEN2_5_OMNI_AUDIO_ATTENTION_CLASSES = {
738
+ "eager": Qwen2_5OmniAudioAttention,
739
+ "flash_attention_2": Qwen2_5OmniAudioFlashAttention2,
740
+ "sdpa": Qwen2_5OmniAudioSdpaAttention,
741
+ }
742
+
743
+
744
+ class Qwen2_5OmniAudioEncoderLayer(nn.Module):
745
+ def __init__(self, config: Qwen2_5OmniAudioEncoderConfig):
746
+ super().__init__()
747
+ self.embed_dim = config.d_model
748
+ self.self_attn = QWEN2_5_OMNI_AUDIO_ATTENTION_CLASSES[config._attn_implementation](
749
+ embed_dim=self.embed_dim,
750
+ num_heads=config.encoder_attention_heads,
751
+ dropout=config.attention_dropout,
752
+ config=config,
753
+ )
754
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
755
+ self.dropout = config.dropout
756
+ self.activation_fn = ACT2FN[config.activation_function]
757
+ self.activation_dropout = config.activation_dropout
758
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
759
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
760
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
761
+
762
+ def forward(
763
+ self,
764
+ hidden_states: torch.Tensor,
765
+ cu_seqlens: torch.Tensor,
766
+ layer_head_mask: torch.Tensor,
767
+ output_attentions: bool = False,
768
+ ):
769
+ residual = hidden_states
770
+ hidden_states = self.self_attn_layer_norm(hidden_states)
771
+ hidden_states, attn_weights, _ = self.self_attn(
772
+ hidden_states=hidden_states,
773
+ cu_seqlens=cu_seqlens,
774
+ layer_head_mask=layer_head_mask,
775
+ output_attentions=output_attentions,
776
+ )
777
+ hidden_states = residual + hidden_states
778
+ residual = hidden_states
779
+ hidden_states = self.final_layer_norm(hidden_states)
780
+ hidden_states = (hidden_states @ self.fc1.weight.t()) + (self.fc1.bias if self.fc1.bias is not None else 0)
781
+ hidden_states = self.activation_fn(hidden_states)
782
+ hidden_states = (hidden_states @ self.fc2.weight.t()) + (self.fc2.bias if self.fc2.bias is not None else 0)
783
+ hidden_states = residual + hidden_states
784
+
785
+ if hidden_states.dtype == torch.float16 and (
786
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
787
+ ):
788
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
789
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
790
+
791
+ outputs: Tuple[Any, ...]
792
+ outputs = (hidden_states,)
793
+
794
+ if output_attentions and attn_weights is not None:
795
+ outputs += (attn_weights,)
796
+
797
+ return outputs
798
+
799
+
800
+ class SinusoidsPositionEmbedding(nn.Module):
801
+ def __init__(self, length, channels, max_timescale=10000):
802
+ super().__init__()
803
+ assert channels % 2 == 0
804
+ log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
805
+ inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
806
+ scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
807
+ self.register_buffer(
808
+ "positional_embedding",
809
+ torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
810
+ persistent=False,
811
+ )
812
+
813
+ def forward(self, seqlen: int):
814
+ return self.positional_embedding[:seqlen, :]
815
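A small usage sketch, assuming the `SinusoidsPositionEmbedding` class above is in scope; sizes are hypothetical. The table concatenates the sine half and the cosine half along the channel dimension, so position 0 is all zeros followed by all ones:

    import torch

    pe = SinusoidsPositionEmbedding(length=8, channels=6)
    emb = pe(seqlen=4)
    print(emb.shape)                                   # torch.Size([4, 6])
    print(torch.allclose(emb[0, :3], torch.zeros(3)))  # True: sin(0) == 0
    print(torch.allclose(emb[0, 3:], torch.ones(3)))   # True: cos(0) == 1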
+
816
+
817
+ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
818
+ """
819
+ Transformer encoder consisting of *config.encoder_layers* self
820
+ attention layers. Each layer is a [`Qwen2_5OmniAudioEncoderLayer`].
821
+
822
+ Args:
823
+ config: Qwen2_5OmniAudioEncoderConfig
824
+ """
825
+
826
+ config_class = Qwen2_5OmniAudioEncoderConfig
827
+ main_input_name = "input_features"
828
+ _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"]
829
+ _supports_sdpa = True
830
+
831
+ def __init__(self, config: Qwen2_5OmniAudioEncoderConfig):
832
+ super().__init__(config)
833
+ self.dropout = config.dropout
834
+ self.layerdrop = config.encoder_layerdrop
835
+
836
+ embed_dim = config.d_model
837
+ self.num_mel_bins = config.num_mel_bins
838
+ self.max_source_positions = config.max_source_positions
839
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
840
+ self.n_window = config.n_window
841
+ self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
842
+ self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
843
+ self.positional_embedding = SinusoidsPositionEmbedding(self.max_source_positions, embed_dim)
844
+ self.audio_bos_eos_token = nn.Embedding(2, config.output_dim)
845
+ self.layers = nn.ModuleList([Qwen2_5OmniAudioEncoderLayer(config) for _ in range(config.encoder_layers)])
846
+ self.ln_post = nn.LayerNorm(config.d_model)
847
+ self.avg_pooler = nn.AvgPool1d(2, stride=2)
848
+ self.proj = nn.Linear(config.d_model, config.output_dim)
849
+ self.gradient_checkpointing = False
850
+ # Initialize weights and apply final processing
851
+ self.post_init()
852
+
853
+ def _freeze_parameters(self):
854
+ for param in self.parameters():
855
+ param.requires_grad = False
856
+ self._requires_grad = False
857
+
858
+ def get_input_embeddings(self) -> nn.Module:
859
+ return self.conv1
860
+
861
+ def set_input_embeddings(self, value):
862
+ self.conv1 = value
863
+
864
+ def forward(
865
+ self,
866
+ input_features,
867
+ feature_lens=None,
868
+ aftercnn_lens=None,
869
+ head_mask=None,
870
+ output_attentions=None,
871
+ output_hidden_states=None,
872
+ return_dict=None,
873
+ ):
874
+
875
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
876
+ output_hidden_states = (
877
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
878
+ )
879
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
880
+
881
+ chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
882
+
883
+ chunk_lengths = torch.tensor(
884
+ [self.n_window * 2] * chunk_num.sum(),
885
+ dtype=torch.long,
886
+ device=feature_lens.device,
887
+ )
888
+ tail_chunk_index = list(accumulate(chunk_num.tolist(), func=operator.add, initial=-1))[1:]
889
+ chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
890
+ chunk_lengths = torch.where(chunk_lengths == 0, self.n_window * 2, chunk_lengths)
891
+
892
+ chunk_list = input_features.split(chunk_lengths.tolist(), dim=1)
893
+ padded_feature, padded_mask, padded_mask_after_cnn = self.padded_and_mask_function(
894
+ chunk_list, chunk_lengths, padding_value=0, padding_side="right"
895
+ )
896
+ padded_embed = nn.functional.gelu(self.conv1(padded_feature)) * padded_mask
897
+ padded_embed = nn.functional.gelu(self.conv2(padded_embed)).transpose(1, 2)
898
+
899
+ padded_embed = padded_embed + self.positional_embedding.positional_embedding[
900
+ : padded_embed.shape[1], :
901
+ ].unsqueeze(0).to(padded_embed.dtype)
902
+ hidden_states = padded_embed[padded_mask_after_cnn]
903
+ cu_seqlens = torch.cat(
904
+ (
905
+ torch.zeros(1, device=padded_mask_after_cnn.device, dtype=torch.int32),
906
+ padded_mask_after_cnn.sum(1).cumsum(0),
907
+ )
908
+ ).to(torch.int32)
909
+ encoder_states: Optional[Tuple[Any, ...]] = () if output_hidden_states else None
910
+ all_attentions: Optional[Tuple[Any, ...]] = () if output_attentions else None
911
+
912
+ tmp_hidden_states = []
913
+ # check if head_mask has a correct number of layers specified if desired
914
+ if head_mask is not None and head_mask.size()[0] != (len(self.layers)):
915
+ raise ValueError(
916
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
917
+ )
918
+
919
+ for idx, encoder_layer in enumerate(self.layers):
920
+ if output_hidden_states and encoder_states is not None and hidden_states is not None:
921
+ encoder_states = encoder_states + (hidden_states,)
922
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
923
+ to_drop = False
924
+ if self.training:
925
+ dropout_probability = torch.rand([])
926
+ if dropout_probability < self.layerdrop: # skip the layer
927
+ to_drop = True
928
+
929
+ # Ignore copy
930
+ if to_drop:
931
+ layer_outputs = (None, None)
932
+ else:
933
+ if self.gradient_checkpointing and self.training:
934
+ layer_outputs = self._gradient_checkpointing_func(
935
+ encoder_layer.__call__,
936
+ hidden_states,
937
+ cu_seqlens,
938
+ (head_mask[idx] if head_mask is not None else None),
939
+ output_attentions,
940
+ )
941
+ else:
942
+ layer_outputs = encoder_layer(
943
+ hidden_states,
944
+ cu_seqlens,
945
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
946
+ output_attentions=output_attentions,
947
+ )
948
+
949
+ hidden_states = layer_outputs[0]
950
+ tmp_hidden_states.append(hidden_states)
951
+
952
+ if output_attentions and all_attentions is not None and layer_outputs is not None:
953
+ all_attentions = all_attentions + (layer_outputs[1],)
954
+
955
+ hidden_states_list = hidden_states.split(aftercnn_lens.tolist(), dim=0)
956
+ token_audio_list = []
957
+ for each_audio_states in hidden_states_list:
958
+ each_audio_states = self.avg_pooler(each_audio_states.transpose(0, 1)).transpose_(0, 1)
959
+ each_audio_states = self.ln_post(each_audio_states)
960
+ each_audio_states = self.proj(each_audio_states)
961
+ token_audio_list.append(each_audio_states)
962
+ token_audio = torch.cat(token_audio_list, dim=0)
963
+ if output_hidden_states and encoder_states is not None and token_audio is not None:
964
+ encoder_states = encoder_states + (token_audio,)
965
+
966
+ if not return_dict:
967
+ return tuple(v for v in [token_audio, encoder_states, all_attentions] if v is not None)
968
+ return BaseModelOutput(last_hidden_state=token_audio, hidden_states=encoder_states, attentions=all_attentions)
969
+
970
+ def padded_and_mask_function(self, tensor_list, tensor_len, padding_value=0, padding_side="right"):
971
+ max_len = tensor_len.max()
972
+ dim = tensor_list[0].shape[0]
973
+ padded_tensor = torch.full(
974
+ size=(len(tensor_list), dim, max_len),
975
+ fill_value=padding_value,
976
+ dtype=tensor_list[0].dtype,
977
+ device=tensor_list[0].device,
978
+ )
979
+
980
+ batch_mask = torch.zeros(
981
+ (len(tensor_len), max_len),
982
+ dtype=torch.long,
983
+ device=padded_tensor.device,
984
+ )
985
+ for i, length in enumerate(tensor_len):
986
+ batch_mask[i, :length] = 1
987
+ padded_tensor[i, :, :length] = tensor_list[i]
988
+
989
+ feature_lens_after_cnn = (tensor_len - 1) // 2 + 1
990
+ max_len_after_cnn = feature_lens_after_cnn.max()
991
+ batch_mask_after_cnn = torch.zeros(
992
+ (len(tensor_len), max_len_after_cnn),
993
+ dtype=torch.long,
994
+ device=padded_tensor.device,
995
+ )
996
+ for i, length in enumerate(feature_lens_after_cnn):
997
+ batch_mask_after_cnn[i, :length] = 1
998
+ return (
999
+ padded_tensor,
1000
+ batch_mask.unsqueeze(1),
1001
+ batch_mask_after_cnn.bool(),
1002
+ )
1003
+
1004
+ # Ignore copy
1005
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
1006
+ """
1007
+ Computes the output length of the convolutional layers and the output length of the audio encoder
1008
+ """
1009
+ input_lengths = (input_lengths - 1) // 2 + 1
1010
+ output_lengths = (input_lengths - 2) // 2 + 1
1011
+ return input_lengths, output_lengths
1012
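A worked example of the two length formulas, using hypothetical mel-frame counts: the first integer division mirrors the stride-2 `conv2` (kernel 3, padding 1), the second mirrors the stride-2 average pooling:

    import torch

    feature_lens = torch.tensor([100, 37])
    after_cnn = (feature_lens - 1) // 2 + 1   # conv2 output lengths: [50, 19]
    after_pool = (after_cnn - 2) // 2 + 1     # audio-token counts after avg_pooler: [25, 9]
    print(after_cnn.tolist(), after_pool.tolist())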
+
1013
+
1014
+ def rotate_half(x):
1015
+ """Rotates half the hidden dims of the input."""
1016
+ x1 = x[..., : x.shape[-1] // 2]
1017
+ x2 = x[..., x.shape[-1] // 2 :]
1018
+ return torch.cat((-x2, x1), dim=-1)
1019
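A minimal sketch of what `rotate_half` does on a toy vector, assuming the function above is in scope:

    import torch

    x = torch.tensor([1.0, 2.0, 3.0, 4.0])   # halves are [1, 2] and [3, 4]
    print(rotate_half(x))                     # tensor([-3., -4.,  1.,  2.])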
+
1020
+
1021
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
1022
+ orig_dtype = tensor.dtype
1023
+ tensor = tensor.float()
1024
+ cos = freqs.cos()
1025
+ sin = freqs.sin()
1026
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
1027
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
1028
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
1029
+ output = output.to(orig_dtype)
1030
+ return output
1031
+
1032
+
1033
+ class Qwen2_5OmniVisionAttention(nn.Module):
1034
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
1035
+ super().__init__()
1036
+ self.num_heads = num_heads
1037
+ self.head_dim = dim // num_heads
1038
+ self.q = nn.Linear(dim, dim, bias=True)
1039
+ self.k = nn.Linear(dim, dim, bias=True)
1040
+ self.v = nn.Linear(dim, dim, bias=True)
1041
+ self.proj = nn.Linear(dim, dim)
1042
+
1043
+ def forward(
1044
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor
1045
+ ) -> torch.Tensor:
1046
+ seq_length = hidden_states.shape[0]
1047
+ q = self.q(hidden_states).reshape(seq_length, self.num_heads, -1)
1048
+ k = self.k(hidden_states).reshape(seq_length, self.num_heads, -1)
1049
+ v = self.v(hidden_states).reshape(seq_length, self.num_heads, -1)
1050
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
1051
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
1052
+
1053
+ attention_mask = torch.full(
1054
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
1055
+ )
1056
+ for i in range(1, len(cu_seqlens)):
1057
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
1058
+
1059
+ q = q.transpose(0, 1)
1060
+ k = k.transpose(0, 1)
1061
+ v = v.transpose(0, 1)
1062
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
1063
+ attn_weights = attn_weights + attention_mask
1064
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
1065
+ attn_output = torch.matmul(attn_weights, v)
1066
+ attn_output = attn_output.transpose(0, 1)
1067
+ attn_output = attn_output.reshape(seq_length, -1)
1068
+ attn_output = self.proj(attn_output)
1069
+ return attn_output
1070
+
1071
+
1072
+ class Qwen2_5OmniVisionFlashAttention2(nn.Module):
1073
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
1074
+ super().__init__()
1075
+ self.num_heads = num_heads
1076
+ self.q = nn.Linear(dim, dim, bias=True)
1077
+ self.k = nn.Linear(dim, dim, bias=True)
1078
+ self.v = nn.Linear(dim, dim, bias=True)
1079
+ self.proj = nn.Linear(dim, dim)
1080
+
1081
+ def _apply_rotary_pos_emb_flashatt(self, tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
1082
+ tensor_ = tensor.float()
1083
+ cos = freqs.cos() # .type_as(tensor_)
1084
+ sin = freqs.sin() # .type_as(tensor_)
1085
+ output = apply_rotary_emb(tensor_, cos, sin).type_as(tensor)
1086
+ return output
1087
+
1088
+ def forward(
1089
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor
1090
+ ) -> torch.Tensor:
1091
+ seq_length = hidden_states.shape[0]
1092
+ q = self.q(hidden_states).reshape(seq_length, self.num_heads, -1)
1093
+ k = self.k(hidden_states).reshape(seq_length, self.num_heads, -1)
1094
+ v = self.v(hidden_states).reshape(seq_length, self.num_heads, -1)
1095
+ q = self._apply_rotary_pos_emb_flashatt(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
1096
+ k = self._apply_rotary_pos_emb_flashatt(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
1097
+
1098
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
1099
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
1100
+ seq_length, -1
1101
+ )
1102
+ attn_output = self.proj(attn_output)
1103
+ return attn_output
1104
+
1105
+
1106
+ class Qwen2_5OmniVisionSdpaAttention(nn.Module):
1107
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
1108
+ super().__init__()
1109
+ self.num_heads = num_heads
1110
+ self.q = nn.Linear(dim, dim, bias=True)
1111
+ self.k = nn.Linear(dim, dim, bias=True)
1112
+ self.v = nn.Linear(dim, dim, bias=True)
1113
+ self.proj = nn.Linear(dim, dim)
1114
+
1115
+ def forward(
1116
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor
1117
+ ) -> torch.Tensor:
1118
+ seq_length = hidden_states.shape[0]
1119
+ q = self.q(hidden_states).reshape(seq_length, self.num_heads, -1)
1120
+ k = self.k(hidden_states).reshape(seq_length, self.num_heads, -1)
1121
+ v = self.v(hidden_states).reshape(seq_length, self.num_heads, -1)
1122
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
1123
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
1124
+
1125
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
1126
+ for i in range(1, len(cu_seqlens)):
1127
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
1128
+ q = q.transpose(0, 1)
1129
+ k = k.transpose(0, 1)
1130
+ v = v.transpose(0, 1)
1131
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
1132
+ attn_output = attn_output.transpose(0, 1)
1133
+ attn_output = attn_output.reshape(seq_length, -1)
1134
+ attn_output = self.proj(attn_output)
1135
+ return attn_output
1136
+
1137
+
1138
+ class Qwen2_5OmniMLP(nn.Module):
1139
+ def __init__(self, config, bias: bool = False):
1140
+ super().__init__()
1141
+ self.hidden_size = config.hidden_size
1142
+ self.intermediate_size = config.intermediate_size
1143
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
1144
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
1145
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
1146
+ self.act_fn = ACT2FN[config.hidden_act]
1147
+
1148
+ def forward(self, hidden_state):
1149
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
1150
+
1151
+
1152
+ class Qwen2RMSNorm(nn.Module):
1153
+ def __init__(self, hidden_size, eps=1e-6):
1154
+ """
1155
+ Qwen2RMSNorm is equivalent to T5LayerNorm
1156
+ """
1157
+ super().__init__()
1158
+ self.weight = nn.Parameter(torch.ones(hidden_size))
1159
+ self.variance_epsilon = eps
1160
+
1161
+ def forward(self, hidden_states):
1162
+ input_dtype = hidden_states.dtype
1163
+ hidden_states = hidden_states.to(torch.float32)
1164
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
1165
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
1166
+ return self.weight * hidden_states.to(input_dtype)
1167
+
1168
+ def extra_repr(self):
1169
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
1170
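A short equivalence check for the RMS normalization above, assuming the `Qwen2RMSNorm` class is in scope; the freshly initialized weight is all ones, so the module output matches the bare RMS formula:

    import torch

    norm = Qwen2RMSNorm(hidden_size=4, eps=1e-6)
    x = torch.randn(2, 3, 4)
    manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
    print(torch.allclose(norm(x), manual, atol=1e-6))  # True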
+
1171
+
1172
+ QWEN2_5_OMNI_VISION_ATTENTION_CLASSES = {
1173
+ "eager": Qwen2_5OmniVisionAttention,
1174
+ "flash_attention_2": Qwen2_5OmniVisionFlashAttention2,
1175
+ "sdpa": Qwen2_5OmniVisionSdpaAttention,
1176
+ }
1177
+
1178
+
1179
+ class Qwen2_5OmniVisionBlock(nn.Module):
1180
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
1181
+ super().__init__()
1182
+ self.norm1 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
1183
+ self.norm2 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
1184
+ self.attn = QWEN2_5_OMNI_VISION_ATTENTION_CLASSES[attn_implementation](
1185
+ config.hidden_size, num_heads=config.num_heads
1186
+ )
1187
+ self.mlp = Qwen2_5OmniMLP(config, bias=True)
1188
+
1189
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
1190
+ hidden_states = hidden_states + self.attn(
1191
+ self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
1192
+ )
1193
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
1194
+ return hidden_states
1195
+
1196
+
1197
+ class Qwen2_5_VisionPatchEmbed(nn.Module):
1198
+ def __init__(
1199
+ self,
1200
+ patch_size: int = 14,
1201
+ temporal_patch_size: int = 2,
1202
+ in_channels: int = 3,
1203
+ embed_dim: int = 1152,
1204
+ ) -> None:
1205
+ super().__init__()
1206
+ self.patch_size = patch_size
1207
+ self.temporal_patch_size = temporal_patch_size
1208
+ self.in_channels = in_channels
1209
+ self.embed_dim = embed_dim
1210
+
1211
+ kernel_size = (temporal_patch_size, patch_size, patch_size)
1212
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
1213
+
1214
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1215
+ target_dtype = self.proj.weight.dtype
1216
+ hidden_states = hidden_states.view(
1217
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
1218
+ )
1219
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
1220
+ return hidden_states
1221
+
1222
+
1223
+ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
1224
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
1225
+ super().__init__()
1226
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
1227
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
1228
+
1229
+ def forward(self, seqlen: int) -> torch.Tensor:
1230
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
1231
+ freqs = torch.outer(seq, self.inv_freq)
1232
+ return freqs
1233
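A small shape sketch for the vision rotary table, assuming the `Qwen2_5_VisionRotaryEmbedding` class above is in scope; `dim` here stands in for the `head_dim // 2` used by the encoder:

    import torch

    rope = Qwen2_5_VisionRotaryEmbedding(dim=8)
    freqs = rope(seqlen=4)
    print(freqs.shape)  # torch.Size([4, 4]): one rotation angle per (position, frequency) pair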
+
1234
+
1235
+ class Qwen2_5OmniPatchMerger(nn.Module):
1236
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
1237
+ super().__init__()
1238
+ self.hidden_size = context_dim * (spatial_merge_size**2)
1239
+ self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
1240
+ self.mlp = nn.Sequential(
1241
+ nn.Linear(self.hidden_size, self.hidden_size),
1242
+ nn.GELU(),
1243
+ nn.Linear(self.hidden_size, dim),
1244
+ )
1245
+
1246
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1247
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
1248
+ return x
1249
+
1250
+
1251
+ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
1252
+ config_class = Qwen2_5OmniVisionEncoderConfig
1253
+ _no_split_modules = ["Qwen2_5OmniVisionBlock"]
1254
+
1255
+ def __init__(self, config, *inputs, **kwargs) -> None:
1256
+ super().__init__(config, *inputs, **kwargs)
1257
+ self.spatial_merge_size = config.spatial_merge_size
1258
+ self.patch_size = config.patch_size
1259
+ self.fullatt_block_indexes = config.fullatt_block_indexes
1260
+ self.window_size = config.window_size
1261
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
1262
+
1263
+ self.patch_embed = Qwen2_5_VisionPatchEmbed(
1264
+ patch_size=config.patch_size,
1265
+ temporal_patch_size=config.temporal_patch_size,
1266
+ in_channels=config.in_channels,
1267
+ embed_dim=config.hidden_size,
1268
+ )
1269
+
1270
+ head_dim = config.hidden_size // config.num_heads
1271
+ self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
1272
+ self.blocks = nn.ModuleList(
1273
+ [Qwen2_5OmniVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
1274
+ )
1275
+ self.merger = Qwen2_5OmniPatchMerger(
1276
+ dim=config.out_hidden_size,
1277
+ context_dim=config.hidden_size,
1278
+ spatial_merge_size=config.spatial_merge_size,
1279
+ )
1280
+ self.gradient_checkpointing = False
1281
+
1282
+ def rot_pos_emb(self, grid_thw):
1283
+ pos_ids = []
1284
+ for t, h, w in grid_thw:
1285
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
1286
+ hpos_ids = hpos_ids.reshape(
1287
+ h // self.spatial_merge_size,
1288
+ self.spatial_merge_size,
1289
+ w // self.spatial_merge_size,
1290
+ self.spatial_merge_size,
1291
+ )
1292
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
1293
+ hpos_ids = hpos_ids.flatten()
1294
+
1295
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
1296
+ wpos_ids = wpos_ids.reshape(
1297
+ h // self.spatial_merge_size,
1298
+ self.spatial_merge_size,
1299
+ w // self.spatial_merge_size,
1300
+ self.spatial_merge_size,
1301
+ )
1302
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
1303
+ wpos_ids = wpos_ids.flatten()
1304
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
1305
+ pos_ids_p = torch.cat(pos_ids, dim=0)
1306
+ max_grid_size = grid_thw[:, 1:].max()
1307
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
1308
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids_p].flatten(1)
1309
+ return rotary_pos_emb
1310
+
1311
+ def get_window_index(self, grid_thw):
1312
+ window_index: list = []
1313
+ cu_window_seqlens: list = [0]
1314
+ window_index_id = 0
1315
+ vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
1316
+
1317
+ for grid_t, grid_h, grid_w in grid_thw:
1318
+ llm_grid_h, llm_grid_w = (
1319
+ grid_h // self.spatial_merge_size,
1320
+ grid_w // self.spatial_merge_size,
1321
+ )
1322
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
1323
+ pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
1324
+ pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
1325
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
1326
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
1327
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
1328
+ index_padded = index_padded.reshape(
1329
+ grid_t,
1330
+ num_windows_h,
1331
+ vit_merger_window_size,
1332
+ num_windows_w,
1333
+ vit_merger_window_size,
1334
+ )
1335
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
1336
+ grid_t,
1337
+ num_windows_h * num_windows_w,
1338
+ vit_merger_window_size,
1339
+ vit_merger_window_size,
1340
+ )
1341
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
1342
+ index_padded = index_padded.reshape(-1)
1343
+ index_new = index_padded[index_padded != -100]
1344
+ window_index.append(index_new + window_index_id)
1345
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
1346
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
1347
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
1348
+ window_index_p = torch.cat(window_index, dim=0)
1349
+
1350
+ return window_index_p, cu_window_seqlens
1351
+
1352
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
1353
+ """
1354
+ Args:
1355
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
1356
+ The final hidden states of the model.
1357
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
1358
+ The temporal, height and width of feature shape of each image in LLM.
1359
+
1360
+ Returns:
1361
+ `torch.Tensor`: hidden_states.
1362
+ """
1363
+ hidden_states = self.patch_embed(hidden_states)
1364
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
1365
+
1366
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
1367
+ cu_window_seqlens = torch.tensor(
1368
+ cu_window_seqlens,
1369
+ device=hidden_states.device,
1370
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
1371
+ )
1372
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
1373
+
1374
+ seq_len, _ = hidden_states.size()
1375
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
1376
+ hidden_states = hidden_states[window_index, :, :]
1377
+ hidden_states = hidden_states.reshape(seq_len, -1)
1378
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
1379
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
1380
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
1381
+
1382
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
1383
+ dim=0,
1384
+ # Select dtype based on the following factors:
1385
+ # - FA2 requires that cu_seqlens_q must have dtype int32
1386
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
1387
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
1388
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
1389
+ )
1390
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
1391
+
1392
+ # Modification here
1393
+ for layer_num, blk in enumerate(self.blocks):
1394
+ if layer_num in self.fullatt_block_indexes:
1395
+ cu_seqlens_now = cu_seqlens
1396
+ else:
1397
+ cu_seqlens_now = cu_window_seqlens
1398
+ if self.gradient_checkpointing and self.training:
1399
+ hidden_states = self._gradient_checkpointing_func(
1400
+ blk.__call__, hidden_states, cu_seqlens_now, rotary_pos_emb
1401
+ )
1402
+ else:
1403
+ hidden_states = blk(
1404
+ hidden_states,
1405
+ cu_seqlens=cu_seqlens_now,
1406
+ rotary_pos_emb=rotary_pos_emb,
1407
+ )
1408
+ hidden_states = self.merger(hidden_states)
1409
+ reverse_indices = torch.argsort(window_index)
1410
+ hidden_states = hidden_states[reverse_indices, :]
1411
+
1412
+ return hidden_states
1413
+
1414
+ def get_dtype(self) -> torch.dtype:
1415
+ return self.blocks[0].mlp.gate_proj.weight.dtype
1416
+
1417
+ def get_device(self) -> torch.device:
1418
+ return self.blocks[0].mlp.gate_proj.weight.device
1419
+
1420
+
1421
+ class Qwen2_5OmniRotaryEmbedding(nn.Module):
1422
+ def __init__(self, config: Qwen2_5OmniThinkerConfig, device=None):
1423
+ super().__init__()
1424
+ # BC: "rope_type" was originally "type"
1425
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
1426
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
1427
+ else:
1428
+ self.rope_type = "default"
1429
+ self.max_seq_len_cached = config.max_position_embeddings
1430
+ self.original_max_seq_len = config.max_position_embeddings
1431
+
1432
+ self.config = config
1433
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
1434
+
1435
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
1436
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
1437
+ self.original_inv_freq = self.inv_freq
1438
+
1439
+ def _dynamic_frequency_update(self, position_ids, device):
1440
+ """
1441
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
1442
+ 1 - growing beyond the cached sequence length (allow scaling)
1443
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
1444
+ """
1445
+ seq_len = torch.max(position_ids) + 1
1446
+ if seq_len > self.max_seq_len_cached: # growth
1447
+ inv_freq, self.attention_scaling = self.rope_init_fn(
1448
+ self.config, device, seq_len=seq_len
1449
+ )
1450
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
1451
+ self.max_seq_len_cached = seq_len
1452
+
1453
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
1454
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
1455
+ self.max_seq_len_cached = self.original_max_seq_len
1456
+
1457
+ @torch.no_grad()
1458
+ def forward(self, x, position_ids):
1459
+ if "dynamic" in self.rope_type:
1460
+ self._dynamic_frequency_update(position_ids, device=x.device)
1461
+
1462
+ # Core RoPE block. In contrast to other models, Qwen2_5Omni has different position ids for the grids
1463
+ # So we expand the inv_freq to shape (3, ...)
1464
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
1465
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
1466
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
1467
+ device_type = x.device.type
1468
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
1469
+ with torch.autocast(device_type=device_type, enabled=False):
1470
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
1471
+ emb = torch.cat((freqs, freqs), dim=-1)
1472
+ cos = emb.cos()
1473
+ sin = emb.sin()
1474
+
1475
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
1476
+ cos = cos * self.attention_scaling
1477
+ sin = sin * self.attention_scaling
1478
+
1479
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
1480
+
1481
+
1482
+ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
1483
+ mrope_section = mrope_section * 2
1484
+ cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
1485
+ sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
1486
+
1487
+ q_embed = (q * cos) + (rotate_half(q) * sin)
1488
+ k_embed = (k * cos) + (rotate_half(k) * sin)
1489
+ return q_embed, k_embed
1490
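A sketch of the `mrope_section` recombination above, with hypothetical sizes: each slice of the head dimension takes its cos/sin table from the temporal, height, or width position ids in turn, and the doubled section list covers both duplicated halves of the head dimension:

    import torch

    mrope_section = [2, 3, 3]                           # shares of half the head_dim
    cos = torch.randn(3, 1, 5, 2 * sum(mrope_section))  # (3 grid axes, batch, seq, head_dim)

    chunks = cos.split(mrope_section * 2, dim=-1)
    merged = torch.cat([m[i % 3] for i, m in enumerate(chunks)], dim=-1)
    print(merged.shape)  # torch.Size([1, 5, 16]): one combined table over the full head_dim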
+
1491
+
1492
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
1493
+ """
1494
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
1495
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
1496
+ (batch, num_attention_heads, seqlen, head_dim)
1497
+ """
1498
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
1499
+ if n_rep == 1:
1500
+ return hidden_states
1501
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
1502
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1503
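A quick check of the equivalence stated in the docstring, assuming `repeat_kv` above is in scope and using hypothetical shapes:

    import torch

    kv = torch.randn(1, 2, 5, 8)       # (batch, num_key_value_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep=4)  # -> (1, 8, 5, 8)
    print(torch.equal(expanded, kv.repeat_interleave(4, dim=1)))  # True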
+
1504
+
1505
+ class Qwen2_5OmniAttention(nn.Module):
1506
+
1507
+ def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None):
1508
+ super().__init__()
1509
+ self.config = config
1510
+ self.layer_idx = layer_idx
1511
+ if layer_idx is None:
1512
+ logger.warning_once(
1513
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
1514
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
1515
+ "when creating this class."
1516
+ )
1517
+
1518
+ self.hidden_size = config.hidden_size
1519
+ self.num_heads = config.num_attention_heads
1520
+ self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
1521
+ self.num_key_value_heads = config.num_key_value_heads
1522
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
1523
+ self.is_causal = True
1524
+ self.attention_dropout = config.attention_dropout
1525
+ self.rope_scaling = config.rope_scaling
1526
+
1527
+ # if (self.head_dim * self.num_heads) != self.hidden_size:
1528
+ # raise ValueError(
1529
+ # f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
1530
+ # f" and `num_heads`: {self.num_heads})."
1531
+ # )
1532
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
1533
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
1534
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
1535
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
1536
+
1537
+ self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config)
1538
+
1539
+ def forward(
1540
+ self,
1541
+ hidden_states: torch.Tensor,
1542
+ attention_mask: Optional[torch.Tensor] = None,
1543
+ position_ids: Optional[torch.LongTensor] = None,
1544
+ past_key_value: Optional[Cache] = None,
1545
+ output_attentions: bool = False,
1546
+ use_cache: bool = False,
1547
+ cache_position: Optional[torch.LongTensor] = None,
1548
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
1549
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1550
+ bsz, q_len, _ = hidden_states.size()
1551
+
1552
+ query_states = self.q_proj(hidden_states)
1553
+ key_states = self.k_proj(hidden_states)
1554
+ value_states = self.v_proj(hidden_states)
1555
+
1556
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1557
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1558
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1559
+
1560
+ assert position_embeddings is not None
1561
+ cos, sin = position_embeddings
1562
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
1563
+ query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
1564
+ )
1565
+
1566
+ if past_key_value is not None:
1567
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1568
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1569
+
1570
+ # repeat k/v heads if n_kv_heads < n_heads
1571
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
1572
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1573
+
1574
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
1575
+
1576
+ if attention_mask is not None: # no matter the length, we just slice it
1577
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
1578
+ attn_weights = attn_weights + causal_mask
1579
+
1580
+ # Fix precision issues in Qwen2-VL float16 inference
1581
+ # Replace inf values with zeros in attention weights to prevent NaN propagation
1582
+ if query_states.dtype == torch.float16:
1583
+ attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
1584
+
1585
+ # upcast attention to fp32
1586
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
1587
+ attn_weights_p: torch.Tensor = nn.functional.dropout(
1588
+ attn_weights, p=self.attention_dropout, training=self.training
1589
+ )
1590
+ return_attn_weights: Optional[torch.Tensor] = attn_weights_p
1591
+ attn_output = torch.matmul(attn_weights_p, value_states)
1592
+
1593
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
1594
+ raise ValueError(
1595
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
1596
+ f" {attn_output.size()}"
1597
+ )
1598
+
1599
+ attn_output = attn_output.transpose(1, 2).contiguous()
1600
+ attn_output = attn_output.reshape(bsz, q_len, -1)
1601
+
1602
+ attn_output = self.o_proj(attn_output)
1603
+
1604
+ if not output_attentions:
1605
+ return_attn_weights = None
1606
+
1607
+ return attn_output, return_attn_weights, past_key_value
1608
+
1609
+
1610
+ class Qwen2MLP(nn.Module):
1611
+ def __init__(self, config, bias: bool = False):
1612
+ super().__init__()
1613
+ self.hidden_size = config.hidden_size
1614
+ self.intermediate_size = config.intermediate_size
1615
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
1616
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
1617
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
1618
+ self.act_fn = ACT2FN[config.hidden_act]
1619
+
1620
+ def forward(self, hidden_state):
1621
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
1622
+
1623
+
1624
+ class Qwen2_5OmniFlashAttention2(Qwen2_5OmniAttention):
1625
+
1626
+ def __init__(self, *args, **kwargs):
1627
+ super().__init__(*args, **kwargs)
1628
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
1629
+
1630
+ def forward(
1631
+ self,
1632
+ hidden_states: torch.Tensor,
1633
+ attention_mask: Optional[torch.Tensor] = None,
1634
+ position_ids: Optional[torch.LongTensor] = None,
1635
+ past_key_value: Optional[Cache] = None,
1636
+ output_attentions: bool = False,
1637
+ use_cache: bool = False,
1638
+ cache_position: Optional[torch.LongTensor] = None,
1639
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
1640
+ ):
1641
+ bsz, q_len, _ = hidden_states.size()
1642
+
1643
+ query_states = self.q_proj(hidden_states)
1644
+ key_states = self.k_proj(hidden_states)
1645
+ value_states = self.v_proj(hidden_states)
1646
+
1647
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1648
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1649
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1650
+
1651
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
1652
+ assert position_embeddings is not None
1653
+ cos, sin = position_embeddings
1654
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
1655
+ query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
1656
+ )
1657
+
1658
+ if past_key_value is not None:
1659
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1660
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1661
+
1662
+ # repeat k/v heads if n_kv_heads < n_heads
1663
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
1664
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1665
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
1666
+
1667
+ # In PEFT, we usually cast the layer norms to float32 for training stability reasons;
1668
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
1669
+ # cast them back to float16 just to be sure everything works as expected.
1670
+ input_dtype = query_states.dtype
1671
+ if input_dtype == torch.float32:
1672
+ if torch.is_autocast_enabled():
1673
+ target_dtype = torch.get_autocast_gpu_dtype()
1674
+ # Handle the case where the model is quantized
1675
+ elif hasattr(self.config, "_pre_quantization_dtype"):
1676
+ target_dtype = self.config._pre_quantization_dtype
1677
+ else:
1678
+ target_dtype = self.q_proj.weight.dtype
1679
+
1680
+ logger.warning_once(
1681
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
1682
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
1683
+ f" {target_dtype}."
1684
+ )
1685
+
1686
+ query_states = query_states.to(target_dtype)
1687
+ key_states = key_states.to(target_dtype)
1688
+ value_states = value_states.to(target_dtype)
1689
+
1690
+ # Reshape to the expected shape for Flash Attention
1691
+ query_states = query_states.transpose(1, 2)
1692
+ key_states = key_states.transpose(1, 2)
1693
+ value_states = value_states.transpose(1, 2)
1694
+
1695
+ if (
1696
+ self.config.use_sliding_window
1697
+ and getattr(self.config, "sliding_window", None) is not None
1698
+ and self.layer_idx >= self.config.max_window_layers
1699
+ ):
1700
+ sliding_window = self.config.sliding_window
1701
+ else:
1702
+ sliding_window = None
1703
+
1704
+ attn_output = _flash_attention_forward(
1705
+ query_states,
1706
+ key_states,
1707
+ value_states,
1708
+ attention_mask,
1709
+ q_len,
1710
+ dropout=dropout_rate,
1711
+ sliding_window=sliding_window,
1712
+ is_causal=self.is_causal,
1713
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
1714
+ )
1715
+
1716
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
1717
+ attn_output = self.o_proj(attn_output)
1718
+
1719
+ if not output_attentions:
1720
+ attn_weights = None
1721
+
1722
+ return attn_output, attn_weights, past_key_value
1723
+
1724
+
1725
+ class Qwen2_5OmniSdpaAttention(Qwen2_5OmniAttention):
1726
+ """
1727
+ Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
1728
+ `Qwen2Attention`, as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
1729
+ the SDPA API.
1730
+ """
1731
+
1732
+ # Adapted from Qwen2Attention.forward
1733
+ def forward(
1734
+ self,
1735
+ hidden_states: torch.Tensor,
1736
+ attention_mask: Optional[torch.Tensor] = None,
1737
+ position_ids: Optional[torch.LongTensor] = None,
1738
+ past_key_value: Optional[Cache] = None,
1739
+ output_attentions: bool = False,
1740
+ use_cache: bool = False,
1741
+ cache_position: Optional[torch.LongTensor] = None,
1742
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1743
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1744
+ if output_attentions:
1745
+ logger.warning_once(
1746
+ "Qwen2_5OmniModel is using Qwen2_5OmniSdpaAttention, but "
1747
+ "`torch.nn.functional.scaled_dot_product_attention`"
1748
+ " does not support `output_attentions=True`. Falling back to "
1749
+ "the manual attention implementation, "
1750
+ "but specifying the manual implementation will be required from "
1751
+ "Transformers version v5.0.0 onwards."
1752
+ " This warning can be removed using the argument "
1753
+ '`attn_implementation="eager"` when loading the model.'
1754
+ )
1755
+ return super().forward(
1756
+ hidden_states=hidden_states,
1757
+ attention_mask=attention_mask,
1758
+ position_ids=position_ids,
1759
+ past_key_value=past_key_value,
1760
+ output_attentions=output_attentions,
1761
+ use_cache=use_cache,
1762
+ cache_position=cache_position,
1763
+ position_embeddings=position_embeddings,
1764
+ )
1765
+
1766
+ bsz, q_len, _ = hidden_states.size()
1767
+
1768
+ query_states = self.q_proj(hidden_states)
1769
+ key_states = self.k_proj(hidden_states)
1770
+ value_states = self.v_proj(hidden_states)
1771
+
1772
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1773
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1774
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
1775
+
1776
+ assert position_embeddings is not None
1777
+ cos, sin = position_embeddings
1778
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
1779
+ query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
1780
+ )
1781
+
1782
+ if past_key_value is not None:
1783
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
1784
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1785
+
1786
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
1787
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1788
+
1789
+ causal_mask = attention_mask
1790
+ if attention_mask is not None: # no matter the length, we just slice it
1791
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
1792
+
1793
+ if query_states.device.type == "cuda" and attention_mask is not None:
1794
+ query_states = query_states.contiguous()
1795
+ key_states = key_states.contiguous()
1796
+ value_states = value_states.contiguous()
1797
+
1798
+ is_causal = True if causal_mask is None and q_len > 1 else False
1799
+
1800
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
1801
+ query_states,
1802
+ key_states,
1803
+ value_states,
1804
+ attn_mask=causal_mask,
1805
+ dropout_p=self.attention_dropout if self.training else 0.0,
1806
+ is_causal=is_causal,
1807
+ )
1808
+
1809
+ attn_output = attn_output.transpose(1, 2).contiguous()
1810
+ attn_output = attn_output.view(bsz, q_len, -1)
1811
+
1812
+ attn_output = self.o_proj(attn_output)
1813
+
1814
+ return attn_output, None, past_key_value
1815
+
1816
+
1817
+ QWEN2_5_OMNI_ATTENTION_CLASSES = {
1818
+ "eager": Qwen2_5OmniAttention,
1819
+ "flash_attention_2": Qwen2_5OmniFlashAttention2,
1820
+ "sdpa": Qwen2_5OmniSdpaAttention,
1821
+ }
1822
+
1823
+
1824
+ class Qwen2_5OmniDecoderLayer(nn.Module):
1825
+ def __init__(self, config: Qwen2_5OmniConfig, layer_idx: int):
1826
+ super().__init__()
1827
+ self.hidden_size = config.hidden_size
1828
+
1829
+ if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
1830
+ logger.warning_once(
1831
+ f"Sliding Window Attention is enabled but not implemented for "
1832
+ f"`{config._attn_implementation}`; "
1833
+ f"unexpected results may be encountered."
1834
+ )
1835
+ self.self_attn = QWEN2_5_OMNI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
1836
+
1837
+ self.mlp = Qwen2MLP(config)
1838
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1839
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1840
+
1841
+ def forward(
1842
+ self,
1843
+ hidden_states: torch.Tensor,
1844
+ attention_mask: Optional[torch.Tensor] = None,
1845
+ position_ids: Optional[torch.LongTensor] = None,
1846
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1847
+ output_attentions: Optional[bool] = False,
1848
+ use_cache: Optional[bool] = False,
1849
+ cache_position: Optional[torch.LongTensor] = None,
1850
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
1851
+ **kwargs,
1852
+ ):
1853
+
1854
+ residual = hidden_states
1855
+
1856
+ hidden_states = self.input_layernorm(hidden_states)
1857
+
1858
+ # Self Attention
1859
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
1860
+ hidden_states=hidden_states,
1861
+ attention_mask=attention_mask,
1862
+ position_ids=position_ids,
1863
+ past_key_value=past_key_value,
1864
+ output_attentions=output_attentions,
1865
+ use_cache=use_cache,
1866
+ cache_position=cache_position,
1867
+ position_embeddings=position_embeddings,
1868
+ )
1869
+ hidden_states = residual + hidden_states
1870
+
1871
+ # Fully Connected
1872
+ residual = hidden_states
1873
+ hidden_states = self.post_attention_layernorm(hidden_states)
1874
+ hidden_states = self.mlp(hidden_states)
1875
+ hidden_states = residual + hidden_states
1876
+
1877
+ outputs: Tuple[Any, ...]
1878
+ outputs = (hidden_states,)
1879
+
1880
+ if output_attentions:
1881
+ outputs += (self_attn_weights,)
1882
+
1883
+ if use_cache:
1884
+ outputs += (present_key_value,)
1885
+
1886
+ return outputs
1887
+
1888
+
1889
+ QWEN2_5OMNI_START_DOCSTRING = r"""add doc"""
1890
+
1891
+
1892
+ @add_start_docstrings(
1893
+ "The bare Qwen2.5OmniThinker Model outputting raw hidden-states without any specific head on top.",
1894
+ QWEN2_5OMNI_START_DOCSTRING.format(config_class="Qwen2_5OmniTextConfig"),
1895
+ )
1896
+ class Qwen2_5OmniThinkerModel(Qwen2_5OmniPreTrainedModel):
1897
+ config_class = Qwen2_5OmniTextConfig
1898
+ _no_split_modules = ["Qwen2_5OmniDecoderLayer"]
1899
+
1900
+ def __init__(self, config: Qwen2_5OmniTextConfig):
1901
+ super().__init__(config)
1902
+ self.padding_idx = config.pad_token_id
1903
+ self.vocab_size = config.vocab_size
1904
+
1905
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1906
+ self.layers = nn.ModuleList(
1907
+ [Qwen2_5OmniDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1908
+ )
1909
+ self._attn_implementation = config._attn_implementation
1910
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1911
+ self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config)
1912
+
1913
+ self.gradient_checkpointing = False
1914
+ # Initialize weights and apply final processing
1915
+ self.post_init()
1916
+
1917
+ def get_input_embeddings(self):
1918
+ return self.embed_tokens
1919
+
1920
+ def set_input_embeddings(self, value):
1921
+ self.embed_tokens = value
1922
+
1923
+ def forward(
1924
+ self,
1925
+ input_ids: Optional[torch.LongTensor] = None,
1926
+ attention_mask: Optional[torch.Tensor] = None,
1927
+ position_ids: Optional[torch.Tensor] = None,
1928
+ past_key_values=None,
1929
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1930
+ use_cache: Optional[bool] = None,
1931
+ output_attentions: Optional[bool] = None,
1932
+ output_hidden_states: Optional[bool] = None,
1933
+ return_dict: Optional[bool] = None,
1934
+ cache_position: Optional[torch.Tensor] = None,
1935
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1936
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1937
+ output_hidden_states = (
1938
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1939
+ )
1940
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1941
+
1942
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1943
+
1944
+ if (input_ids is None) ^ (inputs_embeds is not None):
1945
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1946
+
1947
+ if self.gradient_checkpointing and self.training:
1948
+ if use_cache:
1949
+ logger.warning_once(
1950
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1951
+ )
1952
+ use_cache = False
1953
+
1954
+ # torch.jit.trace() doesn't support cache objects in the output
1955
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
1956
+ past_key_values = DynamicCache()
1957
+
1958
+ if inputs_embeds is None:
1959
+ inputs_embeds = self.embed_tokens(input_ids)
1960
+
1961
+ if cache_position is None:
1962
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1963
+ cache_position = torch.arange(
1964
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1965
+ )
1966
+
1967
+ # the hard coded `3` is for temporal, height and width.
1968
+ if position_ids is None:
1969
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
1970
+ elif position_ids.dim() == 2:
1971
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
1972
+
1973
+ causal_mask = self._update_causal_mask(
1974
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
1975
+ )
1976
+
1977
+ hidden_states = inputs_embeds
1978
+
1979
+ # create position embeddings to be shared across the decoder layers
1980
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
1981
+
1982
+ # decoder layers
1983
+ all_hidden_states: Optional[Tuple[Any, ...]] = () if output_hidden_states else None
1984
+ all_self_attns: Optional[Tuple[Any, ...]] = () if output_attentions else None
1985
+ next_decoder_cache = None
1986
+
1987
+ for decoder_layer in self.layers:
1988
+ if output_hidden_states and hidden_states is not None and all_hidden_states is not None:
1989
+ all_hidden_states += (hidden_states,)
1990
+
1991
+ if self.gradient_checkpointing and self.training:
1992
+ layer_outputs = self._gradient_checkpointing_func(
1993
+ decoder_layer.__call__,
1994
+ hidden_states,
1995
+ causal_mask,
1996
+ position_ids,
1997
+ past_key_values,
1998
+ output_attentions,
1999
+ use_cache,
2000
+ cache_position,
2001
+ position_embeddings,
2002
+ )
2003
+ else:
2004
+ layer_outputs = decoder_layer(
2005
+ hidden_states,
2006
+ attention_mask=causal_mask,
2007
+ position_ids=position_ids,
2008
+ past_key_value=past_key_values,
2009
+ output_attentions=output_attentions,
2010
+ use_cache=use_cache,
2011
+ cache_position=cache_position,
2012
+ position_embeddings=position_embeddings,
2013
+ )
2014
+
2015
+ hidden_states = layer_outputs[0]
2016
+
2017
+ if use_cache:
2018
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
2019
+
2020
+ if output_attentions and layer_outputs is not None and all_self_attns is not None:
2021
+ all_self_attns += (layer_outputs[1],)
2022
+
2023
+ hidden_states = self.norm(hidden_states)
2024
+
2025
+ # add hidden states from the last decoder layer
2026
+ if output_hidden_states and all_hidden_states is not None:
2027
+ all_hidden_states += (hidden_states,)
2028
+
2029
+ next_cache = next_decoder_cache if use_cache else None
2030
+
2031
+ if not return_dict:
2032
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
2033
+ return BaseModelOutputWithPast(
2034
+ last_hidden_state=hidden_states,
2035
+ past_key_values=next_cache,
2036
+ hidden_states=all_hidden_states,
2037
+ attentions=all_self_attns,
2038
+ )
2039
+
2040
+ def _update_causal_mask(
2041
+ self,
2042
+ attention_mask,
2043
+ input_tensor: torch.Tensor,
2044
+ cache_position: torch.Tensor,
2045
+ past_key_values: Cache,
2046
+ output_attentions: bool,
2047
+ ):
2048
+ if self.config._attn_implementation == "flash_attention_2":
2049
+ if attention_mask is not None and past_key_values is not None:
2050
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
2051
+ if is_padding_right:
2052
+ raise ValueError(
2053
+ "You are attempting to perform batched generation with padding_side='right'"
2054
+ " this may lead to unexpected behaviour for Flash Attention version "
2055
+ "of Qwen25OmniThinker. Make sure to "
2056
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
2057
+ )
2058
+ if attention_mask is not None and 0.0 in attention_mask:
2059
+ return attention_mask
2060
+ return None
2061
+
2062
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
2063
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
2064
+ # to infer the attention mask.
2065
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
2066
+ using_static_cache = isinstance(past_key_values, StaticCache)
2067
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
2068
+
2069
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
2070
+ if (
2071
+ self.config._attn_implementation == "sdpa"
2072
+ and not (using_static_cache or using_sliding_window_cache)
2073
+ and not output_attentions
2074
+ ):
2075
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
2076
+ attention_mask,
2077
+ inputs_embeds=input_tensor,
2078
+ past_key_values_length=past_seen_tokens,
2079
+ sliding_window=self.config.sliding_window,
2080
+ is_training=self.training,
2081
+ ):
2082
+ return None
2083
+
2084
+ dtype, device = input_tensor.dtype, input_tensor.device
2085
+ min_dtype = torch.finfo(dtype).min
2086
+ sequence_length = input_tensor.shape[1]
2087
+ # SlidingWindowCache or StaticCache
2088
+ if using_sliding_window_cache or using_static_cache:
2089
+ target_length = past_key_values.get_max_cache_shape()
2090
+ # DynamicCache or no cache
2091
+ else:
2092
+ target_length = (
2093
+ attention_mask.shape[-1]
2094
+ if isinstance(attention_mask, torch.Tensor)
2095
+ else past_seen_tokens + sequence_length + 1
2096
+ )
2097
+
2098
+ # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
2099
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
2100
+ attention_mask,
2101
+ sequence_length=sequence_length,
2102
+ target_length=target_length,
2103
+ dtype=dtype,
2104
+ device=device,
2105
+ cache_position=cache_position,
2106
+ batch_size=input_tensor.shape[0],
2107
+ config=self.config,
2108
+ past_key_values=past_key_values,
2109
+ )
2110
+
2111
+ if (
2112
+ self.config._attn_implementation == "sdpa"
2113
+ and attention_mask is not None
2114
+ and attention_mask.device.type in ["cuda", "xpu"]
2115
+ and not output_attentions
2116
+ ):
2117
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
2118
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
2119
+ # Details: https://github.com/pytorch/pytorch/issues/110213
2120
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
2121
+
2122
+ return causal_mask
2123
+
2124
+ @staticmethod
2125
+ def _prepare_4d_causal_attention_mask_with_cache_position(
2126
+ attention_mask: torch.Tensor,
2127
+ sequence_length: int,
2128
+ target_length: int,
2129
+ dtype: torch.dtype,
2130
+ device: torch.device,
2131
+ cache_position: torch.Tensor,
2132
+ batch_size: int,
2133
+ config: Qwen2_5OmniConfig,
2134
+ past_key_values: Cache,
2135
+ ):
2136
+ if attention_mask is not None and attention_mask.dim() == 4:
2137
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
2138
+ causal_mask = attention_mask
2139
+ else:
2140
+ min_dtype = torch.finfo(dtype).min
2141
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
2142
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
2143
+ if config.sliding_window is not None:
2144
+
2145
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
2146
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
2147
+ cache_position.reshape(-1, 1) - config.sliding_window
2148
+ )
2149
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
2150
+ causal_mask *= diagonal_attend_mask
2151
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2152
+ if attention_mask is not None:
2153
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
2154
+ if attention_mask.shape[-1] > target_length:
2155
+ attention_mask = attention_mask[:, :target_length]
2156
+ mask_length = attention_mask.shape[-1]
2157
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
2158
+ causal_mask.device
2159
+ )
2160
+ padding_mask = padding_mask == 0
2161
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
2162
+ padding_mask, min_dtype
2163
+ )
2164
+ return causal_mask
2165
+
2166
+
2167
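Editorial note: the `_update_causal_mask` / `_prepare_4d_causal_attention_mask_with_cache_position` pair above expands a 2-D padding mask into a 4-D additive causal mask. The following is a minimal, self-contained sketch of that idea only; it ignores sliding windows and KV caches, and all names in it are illustrative rather than taken from this diff.

import torch

def make_4d_causal_mask(padding_mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
    # padding_mask: [batch, seq_len], 1 for real tokens, 0 for padding.
    batch, seq_len = padding_mask.shape
    min_val = torch.finfo(dtype).min
    # Start fully blocked, then open the lower triangle (causal attention).
    mask = torch.full((seq_len, seq_len), min_val, dtype=dtype)
    mask = mask.masked_fill(torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool)), 0.0)
    # Broadcast to [batch, 1, seq_len, seq_len] and block padded key positions.
    mask = mask[None, None, :, :].expand(batch, 1, seq_len, seq_len).clone()
    return mask.masked_fill(padding_mask[:, None, None, :] == 0, min_val)

pad = torch.tensor([[1, 1, 1], [0, 1, 1]])          # second sequence is left-padded
print(make_4d_causal_mask(pad).shape)               # torch.Size([2, 1, 3, 3])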
+ @add_start_docstrings(
2168
+ """The Qwen2.5OmniThinker model, which consists of an audio backbone and a language model.""",
2169
+ QWEN2_5OMNI_START_DOCSTRING.format(config_class="Qwen2_5OmniThinkerConfig"),
2170
+ )
2171
+ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin):
2172
+ config_class = Qwen2_5OmniThinkerConfig
2173
+ _no_split_modules = ["Qwen2_5OmniAudioEncoder", "Qwen2_5OmniVisionEncoder"]
2174
+
2175
+ def __init__(self, config: Qwen2_5OmniThinkerConfig):
2176
+ super().__init__(config)
2177
+ self.audio_tower = Qwen2_5OmniAudioEncoder._from_config(
2178
+ config.audio_config, attn_implementation=config._attn_implementation
2179
+ )
2180
+
2181
+ self.visual = Qwen2_5OmniVisionEncoder._from_config(
2182
+ config.vision_config, attn_implementation=config._attn_implementation
2183
+ )
2184
+
2185
+ self.vocab_size = config.text_config.vocab_size
2186
+ self.model = Qwen2_5OmniThinkerModel._from_config(
2187
+ config.text_config, attn_implementation=config._attn_implementation
2188
+ )
2189
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2190
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
2191
+ self.spatial_merge_size = config.vision_config.spatial_merge_size
2192
+ self.post_init()
2193
+
2194
+ def forward(
2195
+ self,
2196
+ input_ids: Optional[torch.LongTensor] = None,
2197
+ input_features: Optional[torch.Tensor] = None,
2198
+ pixel_values: Optional[torch.Tensor] = None,
2199
+ pixel_values_videos: Optional[torch.Tensor] = None,
2200
+ image_grid_thw: Optional[torch.LongTensor] = None,
2201
+ video_grid_thw: Optional[torch.LongTensor] = None,
2202
+ attention_mask: Optional[torch.Tensor] = None,
2203
+ feature_attention_mask: Optional[torch.Tensor] = None,
2204
+ audio_feature_lengths: Optional[torch.Tensor] = None,
2205
+ position_ids: Optional[torch.Tensor] = None,
2206
+ past_key_values: Optional[List[torch.Tensor]] = None,
2207
+ inputs_embeds: Optional[torch.Tensor] = None,
2208
+ rope_deltas: Optional[torch.Tensor] = None,
2209
+ labels: Optional[torch.LongTensor] = None,
2210
+ use_cache: Optional[bool] = None,
2211
+ output_attentions: Optional[bool] = None,
2212
+ output_hidden_states: Optional[bool] = None,
2213
+ return_dict: Optional[bool] = None,
2214
+ use_audio_in_video: Optional[bool] = None,
2215
+ cache_position: Optional[torch.Tensor] = None,
2216
+ video_second_per_grid: Optional[torch.LongTensor] = None,
2217
+ ) -> Union[Tuple, Qwen2_5OmniThinkerCausalLMOutputWithPast]:
2218
+
2219
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2220
+ output_hidden_states = (
2221
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2222
+ )
2223
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2224
+
2225
+ if feature_attention_mask is not None and input_features is not None:
2226
+ audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
2227
+ input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
2228
+ else:
2229
+ audio_feature_lengths = None
2230
+ if attention_mask is not None and position_ids is None:
2231
+ if cache_position is None or (cache_position is not None and cache_position[0] == 0):
2232
+ delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
2233
+ position_ids_p, rope_deltas = self.get_rope_index(
2234
+ input_ids,
2235
+ image_grid_thw,
2236
+ video_grid_thw,
2237
+ attention_mask,
2238
+ use_audio_in_video,
2239
+ audio_feature_lengths,
2240
+ video_second_per_grid,
2241
+ )
2242
+ rope_deltas = rope_deltas - delta0
2243
+
2244
+ else:
2245
+ assert input_ids is not None
2246
+ batch_size, seq_length = input_ids.shape
2247
+ delta = (
2248
+ cache_position[0] + rope_deltas
2249
+ if cache_position is not None and rope_deltas is not None
2250
+ else torch.tensor(0, device=input_ids.device)
2251
+ )
2252
+ position_ids = torch.arange(seq_length, device=input_ids.device)
2253
+ position_ids_p = position_ids.view(1, -1).expand(batch_size, -1)
2254
+ position_ids_p = position_ids_p.add(delta)
2255
+ position_ids_p = position_ids_p.unsqueeze(0).expand(3, -1, -1)
2256
+
2257
+ if inputs_embeds is None and input_ids is not None:
2258
+ # 1. Extract the input embeddings
2259
+ inputs_embeds = self.get_input_embeddings()(input_ids)
2260
+ embeds_to_talker = inputs_embeds.clone()
2261
+
2262
+ # 2. Merge text, audio, image and video embeddings
2263
+ if input_ids.shape[1] != 1:
2264
+ if input_features is not None and feature_attention_mask is not None:
2265
+ audio_feat_lengths, audio_output_lengths = self.audio_tower._get_feat_extract_output_lengths(
2266
+ audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
2267
+ )
2268
+ feature_lens = (
2269
+ audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
2270
+ )
2271
+ audio_outputs = self.audio_tower(
2272
+ input_features,
2273
+ feature_lens=feature_lens,
2274
+ aftercnn_lens=audio_feat_lengths,
2275
+ )
2276
+ audio_features = audio_outputs.last_hidden_state
2277
+ if audio_features.shape[0] != sum(audio_output_lengths.tolist()):
2278
+ raise ValueError("length of audio_features should match audio_output_lengths")
2279
+ audio_mask = (input_ids == self.config.audio_token_index).unsqueeze(-1).expand_as(inputs_embeds)
2280
+ audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
2281
+ inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_features)
2282
+ embeds_to_talker = embeds_to_talker.masked_scatter(audio_mask, torch.zeros_like(audio_features))
2283
+
2284
+ if pixel_values is not None:
2285
+ pixel_values = pixel_values.type(self.visual.get_dtype())
2286
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
2287
+ image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
2288
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2289
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
2290
+ embeds_to_talker = embeds_to_talker.masked_scatter(image_mask, torch.zeros_like(image_embeds))
2291
+
2292
+ if pixel_values_videos is not None:
2293
+ pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
2294
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
2295
+ video_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
2296
+ video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2297
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
2298
+ embeds_to_talker = embeds_to_talker.masked_scatter(video_mask, torch.zeros_like(video_embeds))
2299
+
2300
+ if attention_mask is not None:
2301
+ attention_mask = attention_mask.to(inputs_embeds.device)
2302
+
2303
+ outputs = self.model(
2304
+ attention_mask=attention_mask,
2305
+ position_ids=position_ids_p,
2306
+ past_key_values=past_key_values,
2307
+ inputs_embeds=inputs_embeds,
2308
+ use_cache=use_cache,
2309
+ output_attentions=output_attentions,
2310
+ output_hidden_states=output_hidden_states,
2311
+ return_dict=return_dict,
2312
+ cache_position=cache_position,
2313
+ )
2314
+
2315
+ hidden_states = outputs[0]
2316
+ logits = self.lm_head(hidden_states)
2317
+
2318
+ loss = None
2319
+ if labels is not None:
2320
+ logits = logits.float()
2321
+ # Shift so that tokens < n predict n
2322
+ if attention_mask is not None:
2323
+ shift_attention_mask = attention_mask[..., 1:]
2324
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
2325
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
2326
+ else:
2327
+ shift_logits = logits[..., :-1, :].contiguous()
2328
+ shift_labels = labels[..., 1:].contiguous()
2329
+ # Flatten the tokens
2330
+ loss_fct = nn.CrossEntropyLoss()
2331
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device))
2332
+
2333
+ if not return_dict:
2334
+ output = (logits,) + ((embeds_to_talker, outputs[0])) + outputs[1:]
2335
+ return (loss,) + output if loss is not None else output
2336
+
2337
+ return Qwen2_5OmniThinkerCausalLMOutputWithPast(
2338
+ loss=loss,
2339
+ logits=logits,
2340
+ past_key_values=outputs.past_key_values,
2341
+ hidden_states=(embeds_to_talker, outputs.hidden_states),
2342
+ attentions=outputs.attentions,
2343
+ attention_mask=attention_mask,
2344
+ rope_deltas=rope_deltas,
2345
+ )
2346
+
2347
+ def prepare_inputs_for_generation(
2348
+ self,
2349
+ input_ids,
2350
+ past_key_values=None,
2351
+ attention_mask=None,
2352
+ inputs_embeds=None,
2353
+ cache_position=None,
2354
+ position_ids=None,
2355
+ use_cache=True,
2356
+ pixel_values=None,
2357
+ pixel_values_videos=None,
2358
+ image_grid_thw=None,
2359
+ video_grid_thw=None,
2360
+ input_features=None,
2361
+ feature_attention_mask=None,
2362
+ use_audio_in_video=False,
2363
+ video_second_per_grid=None,
2364
+ **kwargs,
2365
+ ):
2366
+ model_inputs = super().prepare_inputs_for_generation(
2367
+ input_ids,
2368
+ past_key_values=past_key_values,
2369
+ attention_mask=attention_mask,
2370
+ inputs_embeds=inputs_embeds,
2371
+ cache_position=cache_position,
2372
+ position_ids=position_ids,
2373
+ use_cache=use_cache,
2374
+ pixel_values=pixel_values,
2375
+ pixel_values_videos=pixel_values_videos,
2376
+ image_grid_thw=image_grid_thw,
2377
+ video_grid_thw=video_grid_thw,
2378
+ input_features=input_features,
2379
+ feature_attention_mask=feature_attention_mask,
2380
+ use_audio_in_video=use_audio_in_video,
2381
+ video_second_per_grid=video_second_per_grid,
2382
+ **kwargs,
2383
+ )
2384
+
2385
+ model_inputs["position_ids"] = None
2386
+
2387
+ if cache_position[0] != 0:
2388
+ model_inputs["pixel_values"] = None
2389
+ model_inputs["pixel_values_videos"] = None
2390
+
2391
+ return model_inputs
2392
+
2393
+ def _update_model_kwargs_for_generation(
2394
+ self,
2395
+ outputs: ModelOutput,
2396
+ model_kwargs: Dict[str, Any],
2397
+ is_encoder_decoder: bool = False,
2398
+ num_new_tokens: int = 1,
2399
+ ) -> Dict[str, Any]:
2400
+ # update attention_mask
2401
+ if getattr(outputs, "attention_mask", None) is not None:
2402
+ model_kwargs["attention_mask"] = outputs.attention_mask
2403
+
2404
+ model_kwargs = super()._update_model_kwargs_for_generation(
2405
+ outputs, model_kwargs, is_encoder_decoder, num_new_tokens
2406
+ )
2407
+
2408
+ if getattr(outputs, "rope_deltas", None) is not None:
2409
+ model_kwargs["rope_deltas"] = outputs.rope_deltas
2410
+
2411
+ return model_kwargs
2412
+
2413
+
2414
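Editorial note: the language-modelling loss in `Qwen2_5OmniThinkerForConditionalGeneration.forward` above shifts logits and labels so that position t predicts token t+1, dropping padded positions through the attention mask. A small self-contained sketch of just that shifting, with toy shapes and hypothetical variable names:

import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [0, 0, 1, 1, 1]])   # second sample is left-padded

# Tokens < n predict n: drop the last logit and the first label, then mask out padding.
shift_mask = attention_mask[..., 1:]
shift_logits = logits[..., :-1, :][shift_mask != 0]
shift_labels = labels[..., 1:][shift_mask != 0]

loss = nn.CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.item())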
+ @dataclass
2415
+ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput):
2416
+
2417
+ loss: Optional[torch.FloatTensor] = None
2418
+ logits: Optional[torch.FloatTensor] = None
2419
+ past_key_values: Optional[List[torch.FloatTensor]] = None
2420
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
2421
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
2422
+ attention_mask: Optional[torch.Tensor] = None
2423
+ rope_deltas: Optional[torch.LongTensor] = None
2424
+ thinker_reply_part: Optional[torch.Tensor] = None
2425
+
2426
+
2427
+ @add_start_docstrings(
2428
+ "The bare Qwen2.5OmniTalker Model outputting raw hidden-states without any specific head on top.",
2429
+ QWEN2_5OMNI_START_DOCSTRING.format(config_class="Qwen2_5OmniTalkerConfig"),
2430
+ )
2431
+ class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel):
2432
+ config_class = Qwen2_5OmniTalkerConfig
2433
+ _no_split_modules = ["Qwen2_5OmniTalkerDecoderLayer"]
2434
+
2435
+ def __init__(self, config: Qwen2_5OmniTalkerConfig):
2436
+ super().__init__(config)
2437
+ self.padding_idx = config.pad_token_id
2438
+ self.vocab_size = config.vocab_size
2439
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.embedding_size, self.padding_idx)
2440
+ self.layers = nn.ModuleList(
2441
+ [Qwen2_5OmniDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
2442
+ )
2443
+ self._attn_implementation = config._attn_implementation
2444
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
2445
+ self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config)
2446
+
2447
+ self.gradient_checkpointing = False
2448
+ # Initialize weights and apply final processing
2449
+ self.post_init()
2450
+
2451
+ def get_input_embeddings(self):
2452
+ return self.embed_tokens
2453
+
2454
+ def set_input_embeddings(self, value):
2455
+ self.embed_tokens = value
2456
+
2457
+ def forward(
2458
+ self,
2459
+ input_ids: Optional[torch.LongTensor] = None,
2460
+ attention_mask: Optional[torch.Tensor] = None,
2461
+ position_ids: Optional[torch.Tensor] = None,
2462
+ past_key_values: Optional[Any] = None,
2463
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2464
+ use_cache: Optional[bool] = None,
2465
+ output_attentions: Optional[bool] = None,
2466
+ output_hidden_states: Optional[bool] = None,
2467
+ return_dict: Optional[bool] = None,
2468
+ cache_position: Optional[torch.Tensor] = None,
2469
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
2470
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2471
+ output_hidden_states = (
2472
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2473
+ )
2474
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
2475
+
2476
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2477
+
2478
+ if (input_ids is None) ^ (inputs_embeds is not None):
2479
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
2480
+
2481
+ if self.gradient_checkpointing and self.training:
2482
+ if use_cache:
2483
+ logger.warning_once(
2484
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
2485
+ )
2486
+ use_cache = False
2487
+
2488
+ # torch.jit.trace() doesn't support cache objects in the output
2489
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
2490
+ past_key_values = DynamicCache()
2491
+
2492
+ if inputs_embeds is None:
2493
+ inputs_embeds = self.embed_tokens(input_ids)
2494
+
2495
+ if cache_position is None:
2496
+ past_seen_tokens: Any
2497
+ if past_key_values is not None:
2498
+ past_seen_tokens = past_key_values.get_seq_length()
2499
+ else:
2500
+ past_seen_tokens = 0
2501
+ cache_position = torch.arange(
2502
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
2503
+ )
2504
+
2505
+ # The hard-coded `3` is for the temporal, height and width dimensions.
2506
+ if position_ids is None and cache_position is not None:
2507
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
2508
+ elif position_ids.dim() == 2:
2509
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
2510
+ assert attention_mask is not None and cache_position is not None
2511
+ causal_mask = self._update_causal_mask(
2512
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
2513
+ )
2514
+
2515
+ hidden_states = inputs_embeds
2516
+
2517
+ # create position embeddings to be shared across the decoder layers
2518
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
2519
+
2520
+ # decoder layers
2521
+ all_hidden_states: Optional[Tuple[torch.Tensor, ...]] = () if output_hidden_states else None
2522
+ all_self_attns: Optional[Tuple[torch.Tensor, ...]] = () if output_attentions else None
2523
+ next_decoder_cache = None
2524
+
2525
+ for decoder_layer in self.layers:
2526
+ if output_hidden_states and all_hidden_states is not None and hidden_states is not None:
2527
+ all_hidden_states += (hidden_states,)
2528
+
2529
+ if self.gradient_checkpointing and self.training:
2530
+ layer_outputs = self._gradient_checkpointing_func(
2531
+ decoder_layer.__call__,
2532
+ hidden_states,
2533
+ causal_mask,
2534
+ position_ids,
2535
+ past_key_values,
2536
+ output_attentions,
2537
+ use_cache,
2538
+ cache_position,
2539
+ position_embeddings,
2540
+ )
2541
+ else:
2542
+ layer_outputs = decoder_layer(
2543
+ hidden_states,
2544
+ attention_mask=causal_mask,
2545
+ position_ids=position_ids,
2546
+ past_key_value=past_key_values,
2547
+ output_attentions=output_attentions,
2548
+ use_cache=use_cache,
2549
+ cache_position=cache_position,
2550
+ position_embeddings=position_embeddings,
2551
+ )
2552
+
2553
+ hidden_states = layer_outputs[0]
2554
+
2555
+ if use_cache:
2556
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
2557
+
2558
+ if output_attentions and all_self_attns is not None and layer_outputs is not None:
2559
+ all_self_attns += (layer_outputs[1],)
2560
+
2561
+ hidden_states = self.norm(hidden_states)
2562
+
2563
+ # add hidden states from the last decoder layer
2564
+ if output_hidden_states and all_hidden_states is not None and hidden_states is not None:
2565
+ all_hidden_states += (hidden_states,)
2566
+
2567
+ next_cache = next_decoder_cache if use_cache else None
2568
+
2569
+ if not return_dict:
2570
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
2571
+ return BaseModelOutputWithPast(
2572
+ last_hidden_state=hidden_states,
2573
+ past_key_values=next_cache,
2574
+ hidden_states=all_hidden_states,
2575
+ attentions=all_self_attns,
2576
+ )
2577
+
2578
+ def _update_causal_mask(
2579
+ self,
2580
+ attention_mask: torch.Tensor,
2581
+ input_tensor: torch.Tensor,
2582
+ cache_position: torch.Tensor,
2583
+ past_key_values: Cache,
2584
+ output_attentions: bool,
2585
+ ):
2586
+ if self.config._attn_implementation == "flash_attention_2":
2587
+ if attention_mask is not None and past_key_values is not None:
2588
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
2589
+ if is_padding_right:
2590
+ raise ValueError(
2591
+ "You are attempting to perform batched generation with padding_side='right';"
2592
+ " this may lead to unexpected behaviour for the Flash Attention version "
2593
+ "of Qwen25OmniTalker. Make sure to "
2594
+ "call `tokenizer.padding_side = 'left'` before tokenizing the input."
2595
+ )
2596
+ if attention_mask is not None and 0.0 in attention_mask:
2597
+ return attention_mask
2598
+ return None
2599
+
2600
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
2601
+ using_static_cache = isinstance(past_key_values, StaticCache)
2602
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
2603
+
2604
+ if (
2605
+ self.config._attn_implementation == "sdpa"
2606
+ and not (using_static_cache or using_sliding_window_cache)
2607
+ and not output_attentions
2608
+ ):
2609
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
2610
+ attention_mask,
2611
+ inputs_embeds=input_tensor,
2612
+ past_key_values_length=past_seen_tokens,
2613
+ sliding_window=self.config.sliding_window,
2614
+ is_training=self.training,
2615
+ ):
2616
+ return None
2617
+
2618
+ dtype, device = input_tensor.dtype, input_tensor.device
2619
+ min_dtype = torch.finfo(dtype).min
2620
+ sequence_length = input_tensor.shape[1]
2621
+ # SlidingWindowCache or StaticCache
2622
+ if using_sliding_window_cache or using_static_cache:
2623
+ target_length = past_key_values.get_max_cache_shape()
2624
+ # DynamicCache or no cache
2625
+ else:
2626
+ target_length = (
2627
+ attention_mask.shape[-1]
2628
+ if isinstance(attention_mask, torch.Tensor)
2629
+ else past_seen_tokens + sequence_length + 1
2630
+ )
2631
+
2632
+ # In case the provided `attention_mask` is 2-D, we generate a 4-D causal mask here.
2633
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
2634
+ attention_mask,
2635
+ sequence_length=sequence_length,
2636
+ target_length=target_length,
2637
+ dtype=dtype,
2638
+ device=device,
2639
+ cache_position=cache_position,
2640
+ batch_size=input_tensor.shape[0],
2641
+ config=self.config,
2642
+ past_key_values=past_key_values,
2643
+ )
2644
+
2645
+ if (
2646
+ self.config._attn_implementation == "sdpa"
2647
+ and attention_mask is not None
2648
+ and attention_mask.device.type in ["cuda", "xpu"]
2649
+ and not output_attentions
2650
+ ):
2651
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
2652
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
2653
+ # Details: https://github.com/pytorch/pytorch/issues/110213
2654
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
2655
+
2656
+ return causal_mask
2657
+
2658
+ @staticmethod
2659
+ def _prepare_4d_causal_attention_mask_with_cache_position(
2660
+ attention_mask: torch.Tensor,
2661
+ sequence_length: int,
2662
+ target_length: int,
2663
+ dtype: torch.dtype,
2664
+ device: torch.device,
2665
+ cache_position: torch.Tensor,
2666
+ batch_size: int,
2667
+ config: Qwen2_5OmniConfig,
2668
+ past_key_values: Cache,
2669
+ ):
2670
+ if attention_mask is not None and attention_mask.dim() == 4:
2671
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
2672
+ causal_mask = attention_mask
2673
+ else:
2674
+ min_dtype = torch.finfo(dtype).min
2675
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
2676
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
2677
+ if config.sliding_window is not None:
2678
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
2679
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
2680
+ cache_position.reshape(-1, 1) - config.sliding_window
2681
+ )
2682
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
2683
+ causal_mask *= diagonal_attend_mask
2684
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2685
+ if attention_mask is not None:
2686
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
2687
+ if attention_mask.shape[-1] > target_length:
2688
+ attention_mask = attention_mask[:, :target_length]
2689
+ mask_length = attention_mask.shape[-1]
2690
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
2691
+ causal_mask.device
2692
+ )
2693
+ padding_mask = padding_mask == 0
2694
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
2695
+ padding_mask, min_dtype
2696
+ )
2697
+ return causal_mask
2698
+
2699
+
2700
+ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin):
2701
+ config_class = Qwen2_5OmniTalkerConfig
2702
+
2703
+ def __init__(self, config: Qwen2_5OmniTalkerConfig):
2704
+ super().__init__(config)
2705
+
2706
+ self.thinker_to_talker_proj = nn.Linear(config.embedding_size, config.hidden_size)
2707
+
2708
+ self.model = Qwen2_5OmniTalkerModel(config)
2709
+ self.codebook_size = config.vocab_size
2710
+ self.codec_head = nn.Linear(config.hidden_size, self.codebook_size, bias=False)
2711
+
2712
+ self.codec_bos_token = config.tts_codec_start_token_id
2713
+ self.codec_eos_token = config.tts_codec_end_token_id
2714
+ self.codec_pad_token = config.tts_codec_pad_token_id
2715
+ self.codec_mask_token = config.tts_codec_mask_token_id
2716
+
2717
+ self.text_bos_token = config.tts_text_start_token_id
2718
+ self.text_eos_token = config.tts_text_end_token_id
2719
+ self.text_pad_token = config.tts_text_pad_token_id
2720
+
2721
+ self.spatial_merge_size = self.config.spatial_merge_size
2722
+
2723
+ self.post_init()
2724
+
2725
+ def forward(
2726
+ self,
2727
+ input_ids: torch.LongTensor,
2728
+ attention_mask: Optional[torch.Tensor] = None,
2729
+ position_ids: Optional[torch.Tensor] = None,
2730
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2731
+ thinker_reply_part: Optional[torch.Tensor] = None,
2732
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2733
+ rope_deltas: Optional[torch.LongTensor] = None,
2734
+ use_cache: Optional[bool] = None,
2735
+ cache_position: Optional[torch.LongTensor] = None,
2736
+ input_text_ids: Optional[torch.LongTensor] = None,
2737
+ image_grid_thw: Optional[torch.LongTensor] = None,
2738
+ video_grid_thw: Optional[torch.LongTensor] = None,
2739
+ use_audio_in_video: Optional[bool] = None,
2740
+ audio_feature_lengths: Optional[torch.LongTensor] = None,
2741
+ video_second_per_grid: Optional[torch.LongTensor] = None,
2742
+ output_attentions: Optional[bool] = None,
2743
+ output_hidden_states: Optional[bool] = None,
2744
+ return_dict: Optional[bool] = None,
2745
+ ) -> Union[Tuple, Qwen2_5OmniTalkerCausalLMOutputWithPast]:
2746
+
2747
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2748
+ output_hidden_states = (
2749
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2750
+ )
2751
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2752
+
2753
+ if attention_mask is not None and position_ids is None:
2754
+ if cache_position is None or (cache_position is not None and cache_position[0] == 0):
2755
+ position_ids, rope_deltas = self.get_rope_index(
2756
+ input_text_ids,
2757
+ image_grid_thw,
2758
+ video_grid_thw,
2759
+ attention_mask,
2760
+ use_audio_in_video,
2761
+ audio_feature_lengths,
2762
+ video_second_per_grid,
2763
+ )
2764
+ assert inputs_embeds is not None
2765
+ inputs_embeds[:, -1, :] += self.get_input_embeddings()(
2766
+ torch.tensor([self.codec_bos_token], dtype=torch.long, device=inputs_embeds.device)
2767
+ )
2768
+ inputs_embeds[:, -2, :] += self.get_input_embeddings()(
2769
+ torch.tensor([self.codec_pad_token], dtype=torch.long, device=inputs_embeds.device)
2770
+ )
2771
+
2772
+ else:
2773
+ assert input_ids is not None
2774
+ batch_size, seq_length = input_ids.shape
2775
+ delta = (
2776
+ cache_position[0] + rope_deltas
2777
+ if cache_position is not None and rope_deltas is not None
2778
+ else torch.tensor(0, device=input_ids.device)
2779
+ )
2780
+ position_ids = torch.arange(seq_length, device=input_ids.device)
2781
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
2782
+ position_ids = position_ids.add(delta)
2783
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
2784
+
2785
+ if inputs_embeds is None:
2786
+ assert thinker_reply_part is not None
2787
+ # 1. Decode the second and subsequent tokens
2788
+ codec_embeds = self.get_input_embeddings()(input_ids)
2789
+ inputs_embeds = codec_embeds + thinker_reply_part[:, :1, :]
2790
+ if thinker_reply_part.shape[1] > 1:
2791
+ thinker_reply_part = thinker_reply_part[:, 1:, :]
2792
+
2793
+ talker_lm_input = self.thinker_to_talker_proj(inputs_embeds)
2794
+
2795
+ if attention_mask is not None:
2796
+ attention_mask = attention_mask.to(inputs_embeds.device)
2797
+
2798
+ outputs = self.model(
2799
+ attention_mask=attention_mask,
2800
+ position_ids=position_ids,
2801
+ past_key_values=past_key_values,
2802
+ inputs_embeds=talker_lm_input,
2803
+ use_cache=use_cache,
2804
+ output_attentions=output_attentions,
2805
+ output_hidden_states=output_hidden_states,
2806
+ return_dict=return_dict,
2807
+ )
2808
+
2809
+ hidden_states = outputs[0]
2810
+ logits = self.codec_head(hidden_states)
2811
+ logits = logits.float()
2812
+
2813
+ loss = None
2814
+
2815
+ if not return_dict:
2816
+ output = (logits,) + outputs[1:]
2817
+ return (loss,) + output if loss is not None else output
2818
+
2819
+ return Qwen2_5OmniTalkerCausalLMOutputWithPast(
2820
+ loss=loss,
2821
+ logits=logits,
2822
+ past_key_values=outputs.past_key_values,
2823
+ hidden_states=hidden_states,
2824
+ attentions=outputs.attentions,
2825
+ attention_mask=attention_mask,
2826
+ rope_deltas=rope_deltas,
2827
+ thinker_reply_part=thinker_reply_part,
2828
+ )
2829
+
2830
+ def _get_initial_cache_position(self, input_ids, model_kwargs):
2831
+ # Talker needs to calculate cache_position with input_ids, so pop inputs_embeds temporarily
2832
+ inputs_embeds = model_kwargs.pop("inputs_embeds")
2833
+ model_kwargs = super()._get_initial_cache_position(input_ids, model_kwargs)
2834
+ model_kwargs["inputs_embeds"] = inputs_embeds
2835
+ return model_kwargs
2836
+
2837
+ # prepare inputs for talker lm generation
2838
+ def prepare_inputs_for_generation(
2839
+ self,
2840
+ input_ids,
2841
+ input_text_ids,
2842
+ past_key_values=None,
2843
+ attention_mask=None,
2844
+ inputs_embeds=None,
2845
+ thinker_reply_part=None,
2846
+ cache_position=None,
2847
+ position_ids=None,
2848
+ use_cache=True,
2849
+ pixel_values=None,
2850
+ pixel_values_videos=None,
2851
+ image_grid_thw=None,
2852
+ video_grid_thw=None,
2853
+ input_audio_features=None,
2854
+ audio_feature_attention_mask=None,
2855
+ audio_feature_lengths=None,
2856
+ use_audio_in_video=False,
2857
+ video_second_per_grid=None,
2858
+ **kwargs,
2859
+ ):
2860
+ model_inputs = super().prepare_inputs_for_generation(
2861
+ input_ids,
2862
+ past_key_values,
2863
+ attention_mask,
2864
+ inputs_embeds,
2865
+ cache_position,
2866
+ use_cache=use_cache,
2867
+ thinker_reply_part=thinker_reply_part,
2868
+ input_text_ids=input_text_ids,
2869
+ image_grid_thw=image_grid_thw,
2870
+ video_grid_thw=video_grid_thw,
2871
+ use_audio_in_video=use_audio_in_video,
2872
+ audio_feature_lengths=audio_feature_lengths,
2873
+ video_second_per_grid=video_second_per_grid,
2874
+ **kwargs,
2875
+ )
2876
+
2877
+ model_inputs["position_ids"] = None
2878
+
2879
+ return model_inputs
2880
+
2881
+ def _update_model_kwargs_for_generation(
2882
+ self,
2883
+ outputs: ModelOutput,
2884
+ model_kwargs: Dict[str, Any],
2885
+ is_encoder_decoder: bool = False,
2886
+ num_new_tokens: int = 1,
2887
+ ) -> Dict[str, Any]:
2888
+ # update attention_mask
2889
+ if getattr(outputs, "attention_mask", None) is not None:
2890
+ model_kwargs["attention_mask"] = outputs.attention_mask
2891
+
2892
+ model_kwargs = super()._update_model_kwargs_for_generation(
2893
+ outputs, model_kwargs, is_encoder_decoder, num_new_tokens
2894
+ )
2895
+
2896
+ if getattr(outputs, "rope_deltas", None) is not None:
2897
+ model_kwargs["rope_deltas"] = outputs.rope_deltas
2898
+
2899
+ if getattr(outputs, "thinker_reply_part", None) is not None:
2900
+ model_kwargs["thinker_reply_part"] = outputs.thinker_reply_part
2901
+
2902
+ return model_kwargs
2903
+
2904
+
2905
+ # Uses a custom RoPE; will switch to LlamaRotaryEmbedding in a future version
2906
+ class RotaryEmbedding(nn.Module):
2907
+ def __init__(self, dim, base=10000):
2908
+ super().__init__()
2909
+
2910
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
2911
+ self.register_buffer("inv_freq", inv_freq)
2912
+
2913
+ def forward(self, x):
2914
+ batch_size, seq_len = x.shape[0], x.shape[1]
2915
+ t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
2916
+ freqs = torch.einsum("i , j -> i j", t.type_as(self.inv_freq), self.inv_freq)
2917
+ freqs = torch.stack((freqs, freqs), dim=-1)
2918
+ freqs = freqs.reshape(*freqs.shape[:-2], -1)
2919
+ freqs = freqs.repeat(batch_size, *([1] * freqs.dim()))
2920
+
2921
+ return freqs.cos(), freqs.sin()
2922
+
2923
+
2924
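Editorial note: a hypothetical usage sketch for the custom `RotaryEmbedding` defined above (it assumes the class from this file is in scope). As written, the module duplicates each inverse frequency along the last dimension and returns per-batch cos/sin tables:

import torch

rope = RotaryEmbedding(dim=8)      # dim is the (even) head dimension
x = torch.randn(2, 5, 8)           # [batch, seq_len, head_dim]
cos, sin = rope(x)
print(cos.shape, sin.shape)        # torch.Size([2, 5, 8]) torch.Size([2, 5, 8])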
+ class TDNNBlock(nn.Module):
2925
+ def __init__(
2926
+ self,
2927
+ in_channels,
2928
+ out_channels,
2929
+ kernel_size,
2930
+ dilation,
2931
+ ):
2932
+ super().__init__()
2933
+ self.conv = nn.Conv1d(
2934
+ in_channels=in_channels,
2935
+ out_channels=out_channels,
2936
+ kernel_size=kernel_size,
2937
+ dilation=dilation,
2938
+ padding="same",
2939
+ padding_mode="reflect",
2940
+ )
2941
+ self.activation = nn.ReLU()
2942
+
2943
+ def forward(self, x):
2944
+ return self.activation(self.conv(x))
2945
+
2946
+
2947
+ class Res2NetBlock(torch.nn.Module):
2948
+ """An implementation of Res2NetBlock w/ dilation.
2949
+
2950
+ Arguments
2951
+ ---------
2952
+ in_channels : int
2953
+ The number of channels expected in the input.
2954
+ out_channels : int
2955
+ The number of output channels.
2956
+ scale : int
2957
+ The scale of the Res2Net block.
2958
+ kernel_size: int
2959
+ The kernel size of the Res2Net block.
2960
+ dilation : int
2961
+ The dilation of the Res2Net block.
2962
+ """
2963
+
2964
+ def __init__(self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1):
2965
+ super().__init__()
2966
+ assert in_channels % scale == 0
2967
+ assert out_channels % scale == 0
2968
+
2969
+ in_channel = in_channels // scale
2970
+ hidden_channel = out_channels // scale
2971
+
2972
+ self.blocks = nn.ModuleList(
2973
+ [
2974
+ TDNNBlock(
2975
+ in_channel,
2976
+ hidden_channel,
2977
+ kernel_size=kernel_size,
2978
+ dilation=dilation,
2979
+ )
2980
+ for i in range(scale - 1)
2981
+ ]
2982
+ )
2983
+ self.scale = scale
2984
+
2985
+ def forward(self, x):
2986
+ y = []
2987
+ for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
2988
+ if i == 0:
2989
+ y_i = x_i
2990
+ elif i == 1:
2991
+ y_i = self.blocks[i - 1](x_i)
2992
+ else:
2993
+ y_i = self.blocks[i - 1](x_i + y_i)
2994
+ y.append(y_i)
2995
+ y_p = torch.cat(y, dim=1)
2996
+ return y_p
2997
+
2998
+
2999
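Editorial note: a short, hypothetical usage sketch for `Res2NetBlock` above (shapes are made up; the class defined in this file is assumed to be in scope). The channels are chunked into `scale` groups, and every group after the first flows through a chain of TDNN blocks before being re-concatenated:

import torch

block = Res2NetBlock(in_channels=64, out_channels=64, scale=8, kernel_size=3, dilation=1)
x = torch.randn(2, 64, 100)        # [batch, channels, time]
print(block(x).shape)              # torch.Size([2, 64, 100])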
+ class SEBlock(nn.Module):
3000
+ """An implementation of squeeze-and-excitation block.
3001
+
3002
+ Arguments
3003
+ ---------
3004
+ in_channels : int
3005
+ The number of input channels.
3006
+ se_channels : int
3007
+ The number of output channels after squeeze.
3008
+ out_channels : int
3009
+ The number of output channels.
3010
+ """
3011
+
3012
+ def __init__(self, in_channels, se_channels, out_channels):
3013
+ super().__init__()
3014
+
3015
+ self.conv1 = nn.Conv1d(
3016
+ in_channels=in_channels,
3017
+ out_channels=se_channels,
3018
+ kernel_size=1,
3019
+ padding="same",
3020
+ padding_mode="reflect",
3021
+ )
3022
+ self.relu = nn.ReLU(inplace=True)
3023
+ self.conv2 = nn.Conv1d(
3024
+ in_channels=se_channels,
3025
+ out_channels=out_channels,
3026
+ kernel_size=1,
3027
+ padding="same",
3028
+ padding_mode="reflect",
3029
+ )
3030
+ self.sigmoid = nn.Sigmoid()
3031
+
3032
+ def forward(self, x):
3033
+ s = x.mean(dim=2, keepdim=True)
3034
+
3035
+ s = self.relu(self.conv1(s))
3036
+ s = self.sigmoid(self.conv2(s))
3037
+
3038
+ return s * x
3039
+
3040
+
3041
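Editorial note: the same kind of hypothetical sketch for `SEBlock` above: the time axis is mean-pooled, squeezed through `se_channels`, and the resulting sigmoid gates rescale each channel of the input.

import torch

se = SEBlock(in_channels=64, se_channels=16, out_channels=64)
x = torch.randn(2, 64, 100)        # [batch, channels, time]
print(se(x).shape)                 # torch.Size([2, 64, 100]); channel-wise gating only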
+ class AttentiveStatisticsPooling(nn.Module):
3042
+ """This class implements an attentive statistic pooling layer for each channel.
3043
+ It returns the concatenated mean and std of the input tensor.
3044
+
3045
+ Arguments
3046
+ ---------
3047
+ channels: int
3048
+ The number of input channels.
3049
+ attention_channels: int
3050
+ The number of attention channels.
3051
+ """
3052
+
3053
+ def __init__(self, channels, attention_channels=128):
3054
+ super().__init__()
3055
+
3056
+ self.eps = 1e-12
3057
+ self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
3058
+ self.tanh = nn.Tanh()
3059
+ self.conv = nn.Conv1d(
3060
+ in_channels=attention_channels,
3061
+ out_channels=channels,
3062
+ kernel_size=1,
3063
+ padding="same",
3064
+ padding_mode="reflect",
3065
+ )
3066
+
3067
+ def _length_to_mask(self, length, max_len=None, dtype=None, device=None):
3068
+ assert len(length.shape) == 1
3069
+
3070
+ if max_len is None:
3071
+ max_len = length.max().long().item() # using arange to generate mask
3072
+ mask = torch.arange(max_len, device=length.device, dtype=length.dtype).expand(
3073
+ len(length), max_len
3074
+ ) < length.unsqueeze(1)
3075
+
3076
+ mask = torch.as_tensor(mask, dtype=dtype, device=device)
3077
+ return mask
3078
+
3079
+ def _compute_statistics(self, x, m, dim=2):
3080
+ mean = (m * x).sum(dim)
3081
+ std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(self.eps))
3082
+ return mean, std
3083
+
3084
+ def forward(self, x):
3085
+ """Calculates mean and std for a batch (input tensor).
3086
+
3087
+ Arguments
3088
+ ---------
3089
+ x : torch.Tensor
3090
+ Tensor of shape [N, C, L].
3091
+ """
3092
+ L = x.shape[-1]
3093
+
3094
+ lengths = torch.ones(x.shape[0], device=x.device)
3095
+
3096
+ # Make binary mask of shape [N, 1, L]
3097
+ mask = self._length_to_mask(lengths * L, max_len=L, dtype=x.dtype, device=x.device)
3098
+ mask = mask.unsqueeze(1)
3099
+
3100
+ # Expand the temporal context of the pooling layer by allowing the
3101
+ # self-attention to look at global properties of the utterance.
3102
+ total = mask.sum(dim=2, keepdim=True)
3103
+
3104
+ mean, std = self._compute_statistics(x, mask / total)
3105
+ mean = mean.unsqueeze(2).repeat(1, 1, L)
3106
+ std = std.unsqueeze(2).repeat(1, 1, L)
3107
+ attn = torch.cat([x, mean, std], dim=1)
3108
+
3109
+ # Apply layers
3110
+ attn = self.conv(self.tanh(self.tdnn(attn)))
3111
+
3112
+ # Filter out zero-paddings
3113
+ attn = attn.masked_fill(mask == 0, float("-inf"))
3114
+
3115
+ attn = F.softmax(attn, dim=2)
3116
+ mean, std = self._compute_statistics(x, attn)
3117
+ # Concatenate the attention-weighted mean and std
3118
+ pooled_stats = torch.cat((mean, std), dim=1)
3119
+ pooled_stats = pooled_stats.unsqueeze(2)
3120
+
3121
+ return pooled_stats
3122
+
3123
+
3124
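Editorial note: a shape-level sketch of `AttentiveStatisticsPooling` above (hypothetical sizes, class assumed in scope): an [N, C, L] feature map is reduced to an [N, 2C, 1] vector of attention-weighted channel means and standard deviations.

import torch

asp = AttentiveStatisticsPooling(channels=64, attention_channels=16)
x = torch.randn(2, 64, 100)        # [N, C, L]
print(asp(x).shape)                # torch.Size([2, 128, 1]): mean and std concatenated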
+ class SERes2NetBlock(nn.Module):
3125
+ """An implementation of building block in ECAPA-TDNN, i.e.,
3126
+ TDNN-Res2Net-TDNN-SEBlock.
3127
+
3128
+ Arguments
3129
+ ---------
3130
+ out_channels: int
3131
+ The number of output channels.
3132
+ res2net_scale: int
3133
+ The scale of the Res2Net block.
3134
+ kernel_size: int
3135
+ The kernel size of the TDNN blocks.
3136
+ dilation: int
3137
+ The dilation of the Res2Net block.
3138
+ activation : torch class
3139
+ A class for constructing the activation layers.
3140
+ """
3141
+
3142
+ def __init__(
3143
+ self,
3144
+ in_channels,
3145
+ out_channels,
3146
+ res2net_scale=8,
3147
+ se_channels=128,
3148
+ kernel_size=1,
3149
+ dilation=1,
3150
+ ):
3151
+ super().__init__()
3152
+ self.out_channels = out_channels
3153
+ self.tdnn1 = TDNNBlock(
3154
+ in_channels,
3155
+ out_channels,
3156
+ kernel_size=1,
3157
+ dilation=1,
3158
+ )
3159
+ self.res2net_block = Res2NetBlock(out_channels, out_channels, res2net_scale, kernel_size, dilation)
3160
+ self.tdnn2 = TDNNBlock(
3161
+ out_channels,
3162
+ out_channels,
3163
+ kernel_size=1,
3164
+ dilation=1,
3165
+ )
3166
+ self.se_block = SEBlock(out_channels, se_channels, out_channels)
3167
+
3168
+ def forward(self, x):
3169
+ residual = x
3170
+
3171
+ x = self.tdnn1(x)
3172
+ x = self.res2net_block(x)
3173
+ x = self.tdnn2(x)
3174
+ x = self.se_block(x)
3175
+
3176
+ return x + residual
3177
+
3178
+
3179
+ class ECAPA_TDNN(torch.nn.Module):
3180
+
3181
+ def __init__(self, config: Qwen2_5OmniDiTConfig):
3182
+ super().__init__()
3183
+ assert len(config.enc_channels) == len(config.enc_kernel_sizes)
3184
+ assert len(config.enc_channels) == len(config.enc_dilations)
3185
+ self.channels = config.enc_channels
3186
+ self.blocks = nn.ModuleList()
3187
+
3188
+ # The initial TDNN layer
3189
+ self.blocks.append(
3190
+ TDNNBlock(
3191
+ config.mel_dim,
3192
+ config.enc_channels[0],
3193
+ config.enc_kernel_sizes[0],
3194
+ config.enc_dilations[0],
3195
+ )
3196
+ )
3197
+
3198
+ # SE-Res2Net layers
3199
+ for i in range(1, len(config.enc_channels) - 1):
3200
+ self.blocks.append(
3201
+ SERes2NetBlock(
3202
+ config.enc_channels[i - 1],
3203
+ config.enc_channels[i],
3204
+ res2net_scale=config.enc_res2net_scale,
3205
+ se_channels=config.enc_se_channels,
3206
+ kernel_size=config.enc_kernel_sizes[i],
3207
+ dilation=config.enc_dilations[i],
3208
+ )
3209
+ )
3210
+
3211
+ # Multi-layer feature aggregation
3212
+ self.mfa = TDNNBlock(
3213
+ config.enc_channels[-1],
3214
+ config.enc_channels[-1],
3215
+ config.enc_kernel_sizes[-1],
3216
+ config.enc_dilations[-1],
3217
+ )
3218
+
3219
+ # Attentive Statistical Pooling
3220
+ self.asp = AttentiveStatisticsPooling(
3221
+ config.enc_channels[-1],
3222
+ attention_channels=config.enc_attention_channels,
3223
+ )
3224
+
3225
+ # Final linear transformation
3226
+ self.fc = nn.Conv1d(
3227
+ in_channels=config.enc_channels[-1] * 2,
3228
+ out_channels=config.enc_dim,
3229
+ kernel_size=1,
3230
+ padding="same",
3231
+ padding_mode="reflect",
3232
+ )
3233
+
3234
+ def forward(self, x):
3235
+ """Returns the embedding vector.
3236
+
3237
+ Arguments
3238
+ ---------
3239
+ x : torch.Tensor
3240
+ Tensor of shape (batch, time, channel).
3241
+ """
3242
+ # Minimize transpose for efficiency
3243
+ x = x.transpose(1, 2)
3244
+
3245
+ xl = []
3246
+ for layer in self.blocks:
3247
+ x = layer(x)
3248
+ xl.append(x)
3249
+
3250
+ # Multi-layer feature aggregation
3251
+ x = torch.cat(xl[1:], dim=1)
3252
+ x = self.mfa(x)
3253
+
3254
+ # Attentive Statistical Pooling
3255
+ x = self.asp(x)
3256
+
3257
+ # Final linear transformation
3258
+ x = self.fc(x)
3259
+
3260
+ x = x.squeeze(-1)
3261
+ return x
3262
+
3263
+
3264
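Editorial note: `ECAPA_TDNN` above turns a variable-length mel spectrogram into a fixed-size speaker embedding. The sketch below instantiates it with made-up configuration values through a `SimpleNamespace`; these values are illustrative only and are not the real `Qwen2_5OmniDiTConfig` defaults.

import torch
from types import SimpleNamespace

cfg = SimpleNamespace(
    mel_dim=80, enc_dim=128,
    enc_channels=[256, 256, 256, 256, 768],
    enc_kernel_sizes=[5, 3, 3, 3, 1],
    enc_dilations=[1, 2, 3, 4, 1],
    enc_res2net_scale=4, enc_se_channels=64, enc_attention_channels=64,
)
encoder = ECAPA_TDNN(cfg)
mel = torch.randn(2, 200, 80)      # [batch, time, mel_dim]
print(encoder(mel).shape)          # torch.Size([2, 128]): one embedding per utterance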
+ class InputEmbedding(nn.Module):
3265
+ def __init__(self, config: Qwen2_5OmniDiTConfig):
3266
+ super().__init__()
3267
+ self.proj = nn.Linear(
3268
+ config.mel_dim + config.enc_dim + config.enc_emb_dim + config.emb_dim,
3269
+ config.hidden_size,
3270
+ )
3271
+ self.spk_encoder = ECAPA_TDNN(config)
3272
+
3273
+ def forward(self, x, spk, cond, code_embed, drop_audio_cond=False, code_embed_uncond=None, cfg=True):
3274
+ if cfg:
3275
+ x = torch.cat([x, x], dim=0)
3276
+ spk = torch.cat([spk, torch.zeros_like(spk)], dim=0)
3277
+ cond = torch.cat([cond, torch.zeros_like(cond)], dim=0)
3278
+ code_embed = torch.cat([code_embed, code_embed_uncond], dim=0)
3279
+ elif drop_audio_cond: # cfg for cond audio
3280
+ cond = torch.zeros_like(cond)
3281
+ spk = torch.zeros_like(spk)
3282
+ cond = self.spk_encoder(cond).unsqueeze(1).repeat(1, x.size(1), 1)
3283
+ x = self.proj(torch.cat((x, cond, code_embed, spk), dim=-1))
3284
+
3285
+ return x
3286
+
3287
+
3288
+ # Transformer backbone using DiT blocks
3289
+ class CodecEmbedding(nn.Module):
3290
+ def __init__(self, codec_num_embeds, codec_dim, repeats):
3291
+ super().__init__()
3292
+ self.repeats = repeats
3293
+ self.codec_embed = nn.Embedding(codec_num_embeds + 1, codec_dim)
3294
+
3295
+ def forward(self, code, drop_code=False):
3296
+ if drop_code:
3297
+ code = torch.zeros_like(code)
3298
+ code_embed = self.codec_embed(code)
3299
+
3300
+ code_embed = torch.repeat_interleave(code_embed, repeats=self.repeats, dim=1)
3301
+ return code_embed
3302
+
3303
+
3304
+ # AdaLayerNormZero
3305
+ # Returns the modulated x for the attention input, plus parameters for the later MLP modulation
3306
+ class AdaLayerNormZero(nn.Module):
3307
+ def __init__(self, dim):
3308
+ super().__init__()
3309
+
3310
+ self.silu = nn.SiLU()
3311
+ self.linear = nn.Linear(dim, dim * 6)
3312
+
3313
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
3314
+
3315
+ def forward(self, x, emb=None):
3316
+ emb = self.linear(self.silu(emb))
3317
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
3318
+
3319
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
3320
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
3321
+
3322
+
3323
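Editorial note: the adaLN-Zero modules above follow the DiT recipe: a conditioning embedding is projected to shift/scale/gate terms that modulate a parameter-free LayerNorm. A minimal sketch of that modulation with illustrative names (this is not the vendored module itself):

import torch
import torch.nn as nn

dim = 16
norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
to_mod = nn.Sequential(nn.SiLU(), nn.Linear(dim, 6 * dim))

x = torch.randn(2, 10, dim)        # [batch, tokens, dim]
t = torch.randn(2, dim)            # conditioning (e.g. timestep) embedding

shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = to_mod(t).chunk(6, dim=1)
x_mod = norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
print(x_mod.shape, gate_msa.shape)  # torch.Size([2, 10, 16]) torch.Size([2, 16])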
+ # AdaLayerNormZero for final layer
3324
+ # Returns only the modulated x for the attention input, since there is no further MLP modulation
3325
+ class AdaLayerNormZero_Final(nn.Module):
3326
+ def __init__(self, dim):
3327
+ super().__init__()
3328
+
3329
+ self.silu = nn.SiLU()
3330
+ self.linear = nn.Linear(dim, dim * 2)
3331
+
3332
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
3333
+
3334
+ def forward(self, x, emb):
3335
+ emb = self.linear(self.silu(emb))
3336
+ scale, shift = torch.chunk(emb, 2, dim=1)
3337
+
3338
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
3339
+ return x
3340
+
3341
+
3342
+ # FeedForward
3343
+ class FeedForward(nn.Module):
3344
+ def __init__(self, dim, mult=4, dropout=0.0):
3345
+ super().__init__()
3346
+ inner_dim = int(dim * mult)
3347
+
3348
+ self.ff = nn.ModuleList(
3349
+ [
3350
+ nn.Linear(dim, inner_dim),
3351
+ nn.GELU(approximate="tanh"),
3352
+ nn.Dropout(dropout),
3353
+ nn.Linear(inner_dim, dim),
3354
+ ]
3355
+ )
3356
+
3357
+ def forward(self, x):
3358
+ for layer in self.ff:
3359
+ x = layer(x)
3360
+ return x
3361
+
3362
+
3363
+ # Modified from Llama with a different rotate function; will be fixed in a future release
3364
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
3365
+
3366
+ def rotate_half_codec(x):
3367
+ # x = rearrange(x, "... (d r) -> ... d r", r=2)
3368
+ x = x.reshape(*x.shape[:-1], -1, 2)
3369
+ x1, x2 = x.unbind(dim=-1)
3370
+ x = torch.stack((-x2, x1), dim=-1)
3371
+ return x.reshape(*x.shape[:-2], -1)
3372
+
3373
+ cos = cos.unsqueeze(unsqueeze_dim)
3374
+ sin = sin.unsqueeze(unsqueeze_dim)
3375
+ q_embed = (q * cos) + (rotate_half_codec(q) * sin)
3376
+ k_embed = (k * cos) + (rotate_half_codec(k) * sin)
3377
+ return q_embed, k_embed
3378
+
3379
+
3380
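Editorial note: `apply_rotary_pos_emb` above uses an interleaved rotate-half, mapping each adjacent feature pair (x1, x2) to (-x2, x1) before the cos/sin blend. A tiny check of that pairing on a toy tensor (the function above is assumed to be in scope; setting cos=0 and sin=1 isolates the rotation):

import torch

q = torch.arange(8, dtype=torch.float32).view(1, 1, 1, 8)   # [batch, heads, seq, head_dim]
k = q.clone()
cos = torch.zeros(1, 1, 8)    # cos=0, sin=1 -> output is exactly the rotated half
sin = torch.ones(1, 1, 8)

q_rot, _ = apply_rotary_pos_emb(q, k, cos, sin)
print(q_rot.view(-1))   # tensor([-1., 0., -3., 2., -5., 4., -7., 6.])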
+ class DiTAttention(nn.Module):
3381
+ def __init__(self, config: Qwen2_5OmniDiTConfig):
3382
+ super().__init__()
3383
+
3384
+ self.config = config
3385
+ self.dim = config.hidden_size
3386
+ self.heads = config.num_attention_heads
3387
+ self.inner_dim = config.head_dim * config.num_attention_heads
3388
+ self.dropout = config.dropout
3389
+ self._attn_implementation = config._attn_implementation
3390
+ self.is_causal = False
3391
+
3392
+ self.to_q = nn.Linear(config.hidden_size, self.inner_dim)
3393
+ self.to_k = nn.Linear(config.hidden_size, self.inner_dim)
3394
+ self.to_v = nn.Linear(config.hidden_size, self.inner_dim)
3395
+
3396
+ self.to_out = nn.ModuleList([nn.Linear(self.inner_dim, config.hidden_size), nn.Dropout(config.dropout)])
3397
+
3398
+ def forward(
3399
+ self,
3400
+ x, # noised input x
3401
+ rope=None, # rotary position embedding for x
3402
+ mask=None,
3403
+ ) -> torch.Tensor:
3404
+ batch_size = x.shape[0]
3405
+
3406
+ # `sample` projections.
3407
+ query = self.to_q(x)
3408
+ key = self.to_k(x)
3409
+ value = self.to_v(x)
3410
+
3411
+ # attention
3412
+ inner_dim = key.shape[-1]
3413
+ head_dim = inner_dim // self.heads
3414
+ query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
3415
+ key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
3416
+ value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
3417
+
3418
+ # apply rotary position embedding
3419
+ # Due to the training process, RoPE is applied only to the first head; this will be fixed in a future release
3420
+ cos, sin = rope
3421
+ query[:, :1], key[:, :1] = apply_rotary_pos_emb(query[:, :1], key[:, :1], cos, sin)
3422
+
3423
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self._attn_implementation]
3424
+ x, _ = attention_interface(
3425
+ self,
3426
+ query,
3427
+ key,
3428
+ value,
3429
+ attention_mask=mask,
3430
+ is_causal=False,
3431
+ )
3432
+
3433
+ # mask: e.g. when inference receives a batch with different target durations, the padding is masked out
3434
+ # x = F.scaled_dot_product_attention(query, key, value, attn_mask=mask, dropout_p=0.0, is_causal=False)
3435
+ x = x.reshape(batch_size, -1, self.heads * head_dim)
3436
+ x = x.to(query.dtype)
3437
+
3438
+ # linear proj
3439
+ x = self.to_out[0](x)
3440
+ # dropout
3441
+ x = self.to_out[1](x)
3442
+
3443
+ return x
3444
+
3445
+
3446
+ # time step conditioning embedding
3447
+ class SinusPositionEmbedding(nn.Module):
3448
+ def __init__(self, dim):
3449
+ super().__init__()
3450
+ self.dim = dim
3451
+
3452
+ def forward(self, x, scale=1000):
3453
+ device = x.device
3454
+ half_dim = self.dim // 2
3455
+ emb = math.log(10000) / (half_dim - 1)
3456
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
3457
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
3458
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
3459
+ return emb.type_as(x)
3460
+
3461
+
3462
+ class TimestepEmbedding(nn.Module):
3463
+ def __init__(self, dim, freq_embed_dim=256):
3464
+ super().__init__()
3465
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
3466
+ self.time_mlp = nn.ModuleList([nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)])
3467
+
3468
+ def forward(self, timestep): # noqa: F821
3469
+ time_hidden = self.time_embed(timestep)
3470
+ time_hidden = time_hidden.to(timestep.dtype)
3471
+ for layer in self.time_mlp:
3472
+ time_hidden = layer(time_hidden) # b d
3473
+ return time_hidden
3474
+
3475
+
3476
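Editorial note: a brief usage sketch for the sinusoidal timestep embedding above (hypothetical sizes, classes assumed in scope): a scalar diffusion time per batch element is scaled, mapped to sin/cos features, and refined by the small MLP.

import torch

t_embed = TimestepEmbedding(dim=32, freq_embed_dim=16)
t = torch.rand(4)                  # one (normalized) timestep per batch element
print(t_embed(t).shape)            # torch.Size([4, 32])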
+ class DiTBlock(nn.Module):
3477
+ def __init__(self, config: Qwen2_5OmniDiTConfig, look_ahead_block=0, look_backward_block=0):
3478
+ super().__init__()
3479
+ self.attn_norm = AdaLayerNormZero(config.hidden_size)
3480
+
3481
+ self.attn = DiTAttention(config)
3482
+ self.look_ahead_block = look_ahead_block
3483
+ self.look_backward_block = look_backward_block
3484
+ self.ff_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6)
3485
+ self.ff = FeedForward(dim=config.hidden_size, mult=config.ff_mult, dropout=config.dropout)
3486
+
3487
+ def forward(self, x, t, rope=None, block_diff=None): # x: noised input, t: time embedding
3488
+ # pre-norm & modulation for attention input
3489
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
3490
+
3491
+ # attention
3492
+ attn_output = self.attn(
3493
+ x=norm,
3494
+ rope=rope,
3495
+ mask=(block_diff >= -float(self.look_backward_block)) & (block_diff <= float(self.look_ahead_block)),
3496
+ )
3497
+
3498
+ # process attention output for input x
3499
+ x = x + gate_msa.unsqueeze(1) * attn_output
3500
+
3501
+ norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
3502
+ ff_output = self.ff(norm)
3503
+ x = x + gate_mlp.unsqueeze(1) * ff_output
3504
+
3505
+ return x
3506
+
3507
+
3508
+ class SnakeBeta(nn.Module):
3509
+
3510
+ def __init__(self, in_features, alpha=1.0):
3511
+ super().__init__()
3512
+ self.in_features = in_features
3513
+
3514
+ # initialize alpha
3515
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
3516
+ self.beta = Parameter(torch.zeros(in_features) * alpha)
3517
+
3518
+ self.no_div_by_zero = 0.000000001
3519
+
3520
+ def forward(self, x):
3521
+ """
3522
+ Forward pass of the function.
3523
+ Applies the function to the input elementwise.
3524
+ SnakeBeta := x + (1/beta) * sin^2(x * alpha)
3525
+ """
3526
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
3527
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
3528
+ alpha = torch.exp(alpha)
3529
+ beta = torch.exp(beta)
3530
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
3531
+
3532
+ return x
3533
+
3534
+
3535
+ def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
3536
+ even = kernel_size % 2 == 0
3537
+ half_size = kernel_size // 2
3538
+
3539
+ # For kaiser window
3540
+ delta_f = 4 * half_width
3541
+ A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
3542
+ if A > 50.0:
3543
+ beta = 0.1102 * (A - 8.7)
3544
+ elif A >= 21.0:
3545
+ beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
3546
+ else:
3547
+ beta = 0.0
3548
+ window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
3549
+
3550
+ # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
3551
+ if even:
3552
+ time = torch.arange(-half_size, half_size) + 0.5
3553
+ else:
3554
+ time = torch.arange(kernel_size) - half_size
3555
+ if cutoff == 0:
3556
+ filter_ = torch.zeros_like(time)
3557
+ else:
3558
+ filter_ = 2 * cutoff * window * torch.sinc(2 * cutoff * time)
3559
+ # Normalize filter to have sum = 1, otherwise we will have a small leakage
3560
+ # of the constant component in the input signal.
3561
+ filter_ /= filter_.sum()
3562
+ filter = filter_.view(1, 1, kernel_size)
3563
+
3564
+ return filter
3565
+
3566
+
3567
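Editorial note: the kaiser-windowed sinc kernel above is what the `UpSample1d`/`DownSample1d` modules below use for anti-aliased resampling. A short sketch applying the returned [1, 1, K] kernel as a depthwise low-pass filter (the parameter values mirror the ratio=2 case but are otherwise illustrative):

import torch
import torch.nn.functional as F

kernel_size = 12
filt = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=kernel_size)  # [1, 1, 12]

x = torch.randn(2, 4, 64)                       # [batch, channels, time]
x_pad = F.pad(x, (kernel_size // 2 - 1, kernel_size // 2), mode="replicate")
y = F.conv1d(x_pad, filt.expand(4, -1, -1), groups=4)   # depthwise low-pass, same length
print(y.shape)                                  # torch.Size([2, 4, 64])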
+ class UpSample1d(nn.Module):
3568
+ def __init__(self, ratio=2, kernel_size=None):
3569
+ super().__init__()
3570
+ self.ratio = ratio
3571
+ self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
3572
+ self.stride = ratio
3573
+ self.pad = self.kernel_size // ratio - 1
3574
+ self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
3575
+ self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
3576
+ filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
3577
+ self.register_buffer("filter", filter, persistent=False)
3578
+
3579
+ # x: [B, C, T]
3580
+ def forward(self, x):
3581
+ _, C, _ = x.shape
3582
+
3583
+ x = F.pad(x, (self.pad, self.pad), mode="replicate")
3584
+ x = self.ratio * F.conv_transpose1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
3585
+ x = x[..., self.pad_left : -self.pad_right]
3586
+
3587
+ return x
3588
+
3589
+
3590
+ class DownSample1d(nn.Module):
3591
+ def __init__(self, ratio=2, kernel_size=None):
3592
+ super().__init__()
3593
+ cutoff = 0.5 / ratio
3594
+ half_width = 0.6 / ratio
3595
+ if cutoff < -0.0:
3596
+ raise ValueError("Minimum cutoff must be larger than zero.")
3597
+ if cutoff > 0.5:
3598
+ raise ValueError("A cutoff above 0.5 does not make sense.")
3599
+ self.kernel_size = kernel_size
3600
+ self.even = kernel_size % 2 == 0
3601
+ self.pad_left = kernel_size // 2 - int(self.even)
3602
+ self.pad_right = kernel_size // 2
3603
+ self.stride = ratio
3604
+ filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
3605
+ self.register_buffer("filter", filter, persistent=False)
3606
+
3607
+ def forward(self, x):
3608
+ _, C, _ = x.shape
3609
+
3610
+ x = F.pad(x, (self.pad_left, self.pad_right), mode="replicate")
3611
+ out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
3612
+
3613
+ return out
3614
+
3615
+
3616
+ class TorchActivation1d(nn.Module):
3617
+ def __init__(
3618
+ self,
3619
+ activation,
3620
+ up_ratio: int = 2,
3621
+ down_ratio: int = 2,
3622
+ up_kernel_size: int = 12,
3623
+ down_kernel_size: int = 12,
3624
+ ):
3625
+ super().__init__()
3626
+ self.up_ratio = up_ratio
3627
+ self.down_ratio = down_ratio
3628
+ self.act = activation
3629
+ self.upsample = UpSample1d(up_ratio, up_kernel_size)
3630
+ self.downsample = DownSample1d(down_ratio, down_kernel_size)
3631
+
3632
+ # x: [B,C,T]
3633
+ def forward(self, x):
3634
+ x = self.upsample(x)
3635
+ x = self.act(x)
3636
+ x = self.downsample(x)
3637
+
3638
+ return x
3639
+
3640
+
3641
+ class AMPBlock(torch.nn.Module):
3642
+ def __init__(
3643
+ self,
3644
+ channels,
3645
+ kernel_size=3,
3646
+ dilation=(1, 3, 5),
3647
+ ):
3648
+ super().__init__()
3649
+
3650
+ self.convs1 = nn.ModuleList(
3651
+ [
3652
+ nn.Conv1d(
3653
+ channels,
3654
+ channels,
3655
+ kernel_size,
3656
+ 1,
3657
+ dilation=dilation[0],
3658
+ padding=self._get_padding(kernel_size, dilation[0]),
3659
+ ),
3660
+ nn.Conv1d(
3661
+ channels,
3662
+ channels,
3663
+ kernel_size,
3664
+ 1,
3665
+ dilation=dilation[1],
3666
+ padding=self._get_padding(kernel_size, dilation[1]),
3667
+ ),
3668
+ nn.Conv1d(
3669
+ channels,
3670
+ channels,
3671
+ kernel_size,
3672
+ 1,
3673
+ dilation=dilation[2],
3674
+ padding=self._get_padding(kernel_size, dilation[2]),
3675
+ ),
3676
+ ]
3677
+ )
3678
+
3679
+ self.convs2 = nn.ModuleList(
3680
+ [
3681
+ nn.Conv1d(
3682
+ channels,
3683
+ channels,
3684
+ kernel_size,
3685
+ 1,
3686
+ dilation=1,
3687
+ padding=self._get_padding(kernel_size, 1),
3688
+ ),
3689
+ nn.Conv1d(
3690
+ channels,
3691
+ channels,
3692
+ kernel_size,
3693
+ 1,
3694
+ dilation=1,
3695
+ padding=self._get_padding(kernel_size, 1),
3696
+ ),
3697
+ nn.Conv1d(
3698
+ channels,
3699
+ channels,
3700
+ kernel_size,
3701
+ 1,
3702
+ dilation=1,
3703
+ padding=self._get_padding(kernel_size, 1),
3704
+ ),
3705
+ ]
3706
+ )
3707
+
3708
+ self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers
3709
+
3710
+ self.activations = nn.ModuleList(
3711
+ [TorchActivation1d(activation=SnakeBeta(channels)) for _ in range(self.num_layers)]
3712
+ )
3713
+
3714
+ def _get_padding(self, kernel_size, dilation=1):
3715
+ return int((kernel_size * dilation - dilation) / 2)
3716
+
3717
+ def forward(self, x):
3718
+ acts1, acts2 = self.activations[::2], self.activations[1::2]
3719
+ for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
3720
+ xt = a1(x)
3721
+ xt = c1(xt)
3722
+ xt = a2(xt)
3723
+ xt = c2(xt)
3724
+ x = xt + x
3725
+
3726
+ return x
3727
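+
+ # Editorial sketch (hedged): AMPBlock is a residual stack of three branches, each
+ # applying anti-aliased activation -> dilated conv -> anti-aliased activation ->
+ # conv(dilation=1) and adding the result back to the input. "Same" padding comes
+ # from _get_padding, e.g. kernel_size=3 with dilation=5 gives (3 * 5 - 5) // 2 = 5.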
+
3728
+
3729
+ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
3730
+ config_class = Qwen2_5OmniBigVGANConfig
3731
+
3732
+ def __init__(self, config: Qwen2_5OmniBigVGANConfig):
3733
+ super().__init__(config)
3734
+
3735
+ self.num_kernels = len(config.resblock_kernel_sizes)
3736
+ self.num_upsamples = len(config.upsample_rates)
3737
+
3738
+ # pre conv
3739
+ self.conv_pre = nn.Conv1d(config.mel_dim, config.upsample_initial_channel, 7, 1, padding=3)
3740
+
3741
+ # transposed conv-based upsamplers. does not apply anti-aliasing
3742
+ self.ups = nn.ModuleList()
3743
+ for i, (u, k) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
3744
+ self.ups.append(
3745
+ nn.ModuleList(
3746
+ [
3747
+ ConvTranspose1d(
3748
+ config.upsample_initial_channel // (2**i),
3749
+ config.upsample_initial_channel // (2 ** (i + 1)),
3750
+ k,
3751
+ u,
3752
+ padding=(k - u) // 2,
3753
+ )
3754
+ ]
3755
+ )
3756
+ )
3757
+
3758
+ # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
3759
+ self.resblocks = nn.ModuleList()
3760
+ for i in range(len(self.ups)):
3761
+ ch = config.upsample_initial_channel // (2 ** (i + 1))
3762
+ for j, (k, d) in enumerate(zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes)):
3763
+ self.resblocks.append(AMPBlock(ch, k, d))
3764
+
3765
+ # post conv
3766
+ self.activation_post = TorchActivation1d(activation=SnakeBeta(ch))
3767
+
3768
+ self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
3769
+
3770
+ def _normalize(self, S, max_abs_value, min_db):
3771
+ return torch.clamp(
3772
+ (2 * max_abs_value) * ((S - min_db) / (-min_db)) - max_abs_value, -max_abs_value, max_abs_value
3773
+ )
3774
+
3775
+ def _amp_to_db(self, x, min_level_db):
3776
+ min_level = np.exp(min_level_db / 20 * np.log(10))
3777
+ min_level = torch.ones_like(x) * min_level
3778
+ return 20 * torch.log10(torch.maximum(min_level, x))
3779
+
3780
+ def apm_to_db(self, apm_mel):
3781
+ mel_spec = torch.exp(apm_mel)
3782
+
3783
+ mel_spec = self._amp_to_db(mel_spec, -115) - 20
3784
+ mel_spec = self._normalize(mel_spec, 1, -115)
3785
+
3786
+ return mel_spec
3787
+
3788
+ def forward(self, apm_mel):
3789
+ mel_spec = self.apm_to_db(apm_mel)
3790
+ # pre conv
3791
+ hidden = self.conv_pre(mel_spec)
3792
+
3793
+ for i in range(self.num_upsamples):
3794
+ # upsampling
3795
+ for i_up in range(len(self.ups[i])):
3796
+ ups_i = cast(nn.Sequential, self.ups[i])
3797
+ hidden = ups_i[i_up](hidden)
3798
+ # AMP blocks
3799
+ xs = None
3800
+ for j in range(self.num_kernels):
3801
+ if xs is None:
3802
+ xs = self.resblocks[i * self.num_kernels + j](hidden)
3803
+ else:
3804
+ xs += self.resblocks[i * self.num_kernels + j](hidden)
3805
+ assert xs is not None
3806
+ hidden = xs / self.num_kernels
3807
+
3808
+ # post conv
3809
+ hidden = self.activation_post(hidden)
3810
+ hidden = self.conv_post(hidden)
3811
+ audio = torch.clamp(hidden, min=-1.0, max=1.0) # bound the output to [-1, 1]
3812
+
3813
+ return audio.squeeze().cpu()
3814
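+
+ # Editorial sketch (hedged): the BigVGAN vocoder above maps a log-amplitude mel
+ # spectrogram of shape [B, mel_dim, T] to a waveform clamped to [-1, 1]; every
+ # upsampling stage is a transposed conv whose output is averaged over num_kernels
+ # AMP residual blocks. Illustrative (hypothetical) call on a configured `vocoder`:
+ #     audio = vocoder(mel)   # 1-D CPU tensor of samples for batch size 1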
+
3815
+
3816
+ class ODESolverRK4:
3817
+ def __init__(self, func, y0):
3818
+ self.func = func
3819
+ self.y0 = y0
3820
+
3821
+ self._one_third = 1 / 3
3822
+ self._two_thirds = 2 / 3
3823
+
3824
+ def _rk4_alt_step_func(self, func, t0, dt, t1, y0, f0=None):
3825
+ k1 = f0
3826
+ if k1 is None:
3827
+ k1 = func(t0, y0)
3828
+ k2 = func(t0 + dt * self._one_third, y0 + dt * k1 * self._one_third)
3829
+ k3 = func(t0 + dt * self._two_thirds, y0 + dt * (k2 - k1 * self._one_third))
3830
+ k4 = func(t1, y0 + dt * (k1 - k2 + k3))
3831
+ return (k1 + 3 * (k2 + k3) + k4) * dt * 0.125
3832
+
3833
+ def _step_func(self, func, t0, dt, t1, y0):
3834
+ f0 = func(t0, y0)
3835
+ return self._rk4_alt_step_func(func, t0, dt, t1, y0, f0=f0), f0
3836
+
3837
+ def _linear_interp(self, t0, t1, y0, y1, t):
3838
+ if t == t0:
3839
+ return y0
3840
+ if t == t1:
3841
+ return y1
3842
+ slope = (t - t0) / (t1 - t0)
3843
+ return y0 + slope * (y1 - y0)
3844
+
3845
+ def integrate(self, t):
3846
+ solution = torch.empty(len(t), *self.y0.shape, dtype=self.y0.dtype, device=self.y0.device)
3847
+ solution[0] = self.y0
3848
+
3849
+ j = 1
3850
+ y0 = self.y0
3851
+ for t0, t1 in zip(t[:-1], t[1:]):
3852
+ dt = t1 - t0
3853
+ dy, f0 = self._step_func(self.func, t0, dt, t1, y0)
3854
+ y1 = y0 + dy
3855
+
3856
+ while j < len(t) and t1 >= t[j]:
3857
+ solution[j] = self._linear_interp(t0, t1, y0, y1, t[j])
3858
+ j += 1
3859
+ y0 = y1
3860
+
3861
+ return solution
3862
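+
+ # Editorial sketch (hedged): ODESolverRK4 is a fixed-step Runge-Kutta integrator in
+ # the 3/8-rule variant (k2 at t0 + dt/3, k3 at t0 + 2*dt/3, combined as
+ # (k1 + 3*k2 + 3*k3 + k4) * dt / 8), with linear interpolation onto the requested
+ # time grid. Illustrative (hypothetical) usage on dy/dt = -y:
+ #     solver = ODESolverRK4(func=lambda t, y: -y, y0=torch.ones(1))
+ #     ys = solver.integrate(torch.linspace(0.0, 1.0, 11))   # ys[-1] ~ exp(-1)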
+
3863
+
3864
+ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
3865
+ config_class = Qwen2_5OmniDiTConfig
3866
+ _no_split_modules = ["DiTBlock"]
3867
+
3868
+ def __init__(self, config: Qwen2_5OmniDiTConfig):
3869
+ super().__init__(config)
3870
+ self.mel_dim = config.mel_dim
3871
+ self.repeats = config.repeats
3872
+ self.time_embed = TimestepEmbedding(config.hidden_size)
3873
+
3874
+ self.text_embed = CodecEmbedding(config.num_embeds, config.emb_dim, config.repeats)
3875
+ self.input_embed = InputEmbedding(config)
3876
+
3877
+ self.rotary_embed = RotaryEmbedding(config.head_dim)
3878
+ # self.rotary_embed = Qwen2_5OmniDiTRotaryEmbedding(config)
3879
+
3880
+ self.hidden_size = config.hidden_size
3881
+ self.layers = config.num_hidden_layers
3882
+ self.block_size = config.block_size
3883
+ self.num_attention_heads = config.num_attention_heads
3884
+
3885
+ self.transformer_blocks = nn.ModuleList()
3886
+ for i in range(config.num_hidden_layers):
3887
+ self.transformer_blocks.append(
3888
+ DiTBlock(
3889
+ config,
3890
+ look_ahead_block=1 if i in config.look_ahead_layers else 0,
3891
+ look_backward_block=1 if i in config.look_backward_layers else 0,
3892
+ )
3893
+ )
3894
+
3895
+ self.norm_out = AdaLayerNormZero_Final(config.hidden_size) # final modulation
3896
+ self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
3897
+
3898
+ def _create_block_diff(self, x):
3899
+ batch, seq_len = x.shape[0], x.shape[1]
3900
+ block_indices = torch.arange(seq_len, device=x.device) // self.block_size # [seq_length]
3901
+
3902
+ block_i = block_indices.unsqueeze(1) # [seq_length, 1]
3903
+ block_j = block_indices.unsqueeze(0) # [1, seq_length]
3904
+
3905
+ block_diff = block_j - block_i # (n, n)
3906
+
3907
+ return block_diff.expand(batch, self.num_attention_heads, seq_len, seq_len)
3908
+
3909
+ def forward(
3910
+ self,
3911
+ x, # noised input audio
3912
+ cond, # masked cond audio
3913
+ spk, # spk embedding
3914
+ code, # code
3915
+ time, # time step # noqa: F821 F722
3916
+ drop_audio_cond=False, # cfg for cond audio
3917
+ drop_code=False, # cfg for code
3918
+ cfg=True,
3919
+ ):
3920
+ batch = x.shape[0]
3921
+ if time.ndim == 0:
3922
+ time = time.repeat(batch)
3923
+
3924
+ # t: conditioning time, c: context (code + masked cond audio), x: noised input audio
3925
+ t = self.time_embed(time)
3926
+ code_embed = self.text_embed(code, drop_code=False if cfg else drop_code)
3927
+ code_embed_uncond = self.text_embed(code, drop_code=True) if cfg else None
3928
+ hidden = self.input_embed(
3929
+ x,
3930
+ spk,
3931
+ cond,
3932
+ code_embed,
3933
+ drop_audio_cond=drop_audio_cond,
3934
+ code_embed_uncond=code_embed_uncond,
3935
+ cfg=cfg,
3936
+ )
3937
+
3938
+ # rope = self.rotary_embed(x, torch.arange(seq_len, device=x.device).repeat(batch, 1))
3939
+ rope = self.rotary_embed(hidden)
3940
+
3941
+ block_diff = self._create_block_diff(hidden)
3942
+
3943
+ for block in self.transformer_blocks:
3944
+ hidden = block(hidden, t, rope=rope, block_diff=block_diff)
3945
+
3946
+ hidden = self.norm_out(hidden, t)
3947
+ output = self.proj_out(hidden)
3948
+
3949
+ return output
3950
+
3951
+ @torch.no_grad()
3952
+ def sample(
3953
+ self,
3954
+ cond,
3955
+ ref_mel,
3956
+ code,
3957
+ steps=10,
3958
+ cfg_strength=0.5,
3959
+ sway_sampling_coef=-1.0,
3960
+ ):
3961
+ y_all = torch.randn([1, 30000, self.mel_dim], dtype=ref_mel.dtype)
3962
+ max_duration = code.shape[1] * self.repeats
3963
+ y0 = y_all[:, :max_duration].to(code.device)
3964
+ batch = ref_mel.shape[0]
3965
+ cond = cond.unsqueeze(1).repeat(1, max_duration, 1)
3966
+ assert batch == 1, "only batch size = 1 is currently supported"
3967
+
3968
+ def fn(t, x):
3969
+ if cfg_strength < 1e-5:
3970
+ pred = self(x=x, spk=cond, cond=ref_mel, code=code, time=t, drop_audio_cond=False, drop_code=False)
3971
+ return pred
3972
+
3973
+ output = self(x=x, code=code, spk=cond, cond=ref_mel, time=t, cfg=True)
3974
+ pred, null_pred = torch.chunk(output, 2, dim=0)
3975
+
3976
+ return pred + (pred - null_pred) * cfg_strength
3977
+
3978
+ t_start = 0
3979
+ t = torch.linspace(t_start, 1, steps, device=code.device, dtype=cond.dtype)
3980
+ if sway_sampling_coef is not None:
3981
+ t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
3982
+
3983
+ solver = ODESolverRK4(func=fn, y0=y0)
3984
+ trajectory = solver.integrate(t)
3985
+
3986
+ generated = trajectory[-1]
3987
+ generated_mel_spec = generated.permute(0, 2, 1)
3988
+ return generated_mel_spec
3989
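+
+ # Editorial sketch (hedged): sample() runs flow matching from Gaussian noise y0 to a
+ # mel spectrogram. Classifier-free guidance combines the conditional and
+ # unconditional predictions as pred + (pred - null_pred) * cfg_strength, and the
+ # "sway sampling" warp t + coef * (cos(pi/2 * t) - 1 + t) with a negative coef
+ # concentrates integration steps near t = 0. Illustrative call on a configured `dit`:
+ #     mel = dit.sample(cond, ref_mel, code, steps=10)   # [1, mel_dim, T]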
+
3990
+
3991
+ @add_start_docstrings(
3992
+ (
3993
+ "The full Qwen2.5Omni Token2Wav model. Consists a DiT model take speech"
3994
+ " tokens as input and predict mel spectrogram and a BigVGAN vocoder take"
3995
+ " mel spectrogram as input and predict waveform."
3996
+ ),
3997
+ QWEN2_5OMNI_START_DOCSTRING.format(config_class="Qwen2_5OmniToken2WavConfig"),
3998
+ )
3999
+ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
4000
+ config_class = Qwen2_5OmniToken2WavConfig
4001
+ base_model_prefix = "model"
4002
+ _no_split_modules = ["Qwen2_5OmniToken2WavDiTModel", "Qwen2_5OmniToken2WavBigVGANModel"]
4003
+
4004
+ def __init__(self, config: Qwen2_5OmniToken2WavConfig):
4005
+ super().__init__(config)
4006
+ attn_impl = config._attn_implementation
4007
+ if config._attn_implementation == "flash_attention_2":
4008
+ logger.warning_once(
4009
+ "Qwen2_5OmniToken2WavModel must inference with fp32, but "
4010
+ "flash_attention_2 only supports fp16 and bf16, "
4011
+ "attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa."
4012
+ )
4013
+ attn_impl = "sdpa"
4014
+ elif config._attn_implementation == "eager":
4015
+ logger.warning_once(
4016
+ "Qwen2_5OmniToken2WavModel does not support eager attention implementation, " "fall back to sdpa"
4017
+ )
4018
+ attn_impl = "sdpa"
4019
+ self.code2wav_dit_model = Qwen2_5OmniToken2WavDiTModel._from_config(
4020
+ config.dit_config, attn_implementation=attn_impl
4021
+ )
4022
+ self.code2wav_bigvgan_model = Qwen2_5OmniToken2WavBigVGANModel._from_config(
4023
+ config.bigvgan_config, attn_implementation=attn_impl
4024
+ )
4025
+
4026
+ def forward(
4027
+ self,
4028
+ code,
4029
+ cond,
4030
+ ref_mel,
4031
+ steps=10,
4032
+ cfg_strength=0.5,
4033
+ sway_sampling_coef=-1.0,
4034
+ **kwargs,
4035
+ ):
4036
+ generated_mel = self.code2wav_dit_model.sample(
4037
+ cond,
4038
+ ref_mel,
4039
+ code,
4040
+ steps=steps,
4041
+ cfg_strength=cfg_strength,
4042
+ sway_sampling_coef=sway_sampling_coef,
4043
+ )
4044
+ waveform = self.code2wav_bigvgan_model(generated_mel)
4045
+ return waveform
4046
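+
+ # Editorial sketch (hedged): Token2Wav chains the two modules above: speech codes ->
+ # DiT flow-matching sampler -> mel spectrogram -> BigVGAN vocoder -> waveform, all
+ # run in fp32. Illustrative (hypothetical) call on a configured `token2wav`:
+ #     wav = token2wav(code, cond=spk_embedding, ref_mel=reference_mel, steps=10)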
+
4047
+
4048
+ @add_start_docstrings(
4049
+ """""",
4050
+ QWEN2_5OMNI_START_DOCSTRING.format(config_class=Qwen2_5OmniConfig),
4051
+ )
4052
+ class Qwen2_5OmniModel(Qwen2_5OmniPreTrainedModel):
4053
+ config_class = Qwen2_5OmniConfig
4054
+ _no_split_modules = [
4055
+ "Qwen2_5OmniTalkerForConditionalGeneration",
4056
+ "Qwen2_5OmniToken2WavModel",
4057
+ ]
4058
+
4059
+ def __init__(self, config):
4060
+ super().__init__(config)
4061
+
4062
+ self.thinker = Qwen2_5OmniThinkerForConditionalGeneration(config.thinker_config)
4063
+
4064
+ self.has_talker = config.enable_audio_output
4065
+ self.speaker_map = {}
4066
+ if config.enable_audio_output:
4067
+ self.enable_talker()
4068
+
4069
+ def enable_talker(self):
4070
+ self.talker = Qwen2_5OmniTalkerForConditionalGeneration(self.config.talker_config)
4071
+ self.token2wav = Qwen2_5OmniToken2WavModel(self.config.token2wav_config)
4072
+ self.token2wav.float()
4073
+ self.has_talker = True
4074
+
4075
+ def load_speakers(self, path):
4076
+ for key, value in torch.load(path).items():
4077
+ self.speaker_map[key] = value
4078
+ logger.info("Speakers {} loaded".format(list(self.speaker_map.keys())))
4079
+
4080
+ def disable_talker(self):
4081
+ if hasattr(self, "talker"):
4082
+ del self.talker
4083
+ if hasattr(self, "token2wav"):
4084
+ del self.token2wav
4085
+ self.has_talker = False
4086
+
4087
+ @classmethod
4088
+ def can_generate(cls) -> bool:
4089
+ return True
4090
+
4091
+ @classmethod
4092
+ def from_pretrained(
4093
+ cls,
4094
+ pretrained_model_name_or_path,
4095
+ *model_args,
4096
+ config=None,
4097
+ cache_dir=None,
4098
+ ignore_mismatched_sizes=False,
4099
+ force_download=False,
4100
+ local_files_only=False,
4101
+ token=None,
4102
+ revision="main",
4103
+ use_safetensors=None,
4104
+ weights_only=True,
4105
+ **kwargs,
4106
+ ):
4107
+ model = super().from_pretrained(
4108
+ pretrained_model_name_or_path,
4109
+ *model_args,
4110
+ config=config,
4111
+ cache_dir=cache_dir,
4112
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
4113
+ force_download=force_download,
4114
+ local_files_only=local_files_only,
4115
+ token=token,
4116
+ revision=revision,
4117
+ use_safetensors=use_safetensors,
4118
+ weights_only=weights_only,
4119
+ **kwargs,
4120
+ )
4121
+ spk_path = cached_file(
4122
+ pretrained_model_name_or_path,
4123
+ "spk_dict.pt",
4124
+ subfolder=kwargs.pop("subfolder", None),
4125
+ cache_dir=kwargs.pop("cache_dir", None),
4126
+ force_download=kwargs.pop("force_download", False),
4127
+ proxies=kwargs.pop("proxies", None),
4128
+ resume_download=kwargs.pop("resume_download", None),
4129
+ local_files_only=kwargs.pop("local_files_only", False),
4130
+ token=kwargs.pop("use_auth_token", None),
4131
+ revision=kwargs.pop("revision", None),
4132
+ )
4133
+ if spk_path is None:
4134
+ raise ValueError(f"""{pretrained_model_name_or_path}/{spk_path} not exists""")
4135
+ model.load_speakers(spk_path)
4136
+
4137
+ return model
4138
+
4139
+ @torch.no_grad()
4140
+ def generate(
4141
+ self,
4142
+ input_ids: Optional[torch.Tensor] = None,
4143
+ spk: str = "Chelsie",
4144
+ use_audio_in_video: bool = False,
4145
+ return_audio: Optional[bool] = None,
4146
+ thinker_max_new_tokens: int = 1024,
4147
+ talker_max_new_tokens: int = 4096,
4148
+ talker_do_sample: bool = True,
4149
+ talker_top_k: int = 40,
4150
+ talker_top_p: float = 0.8,
4151
+ talker_temperature: float = 0.9,
4152
+ talker_eos_token_id: list[int] = [8292, 8294],
4153
+ talker_repetition_penalty: float = 1.05,
4154
+ **kwargs,
4155
+ ):
4156
+ if spk not in self.speaker_map:
4157
+ raise ValueError(f"{spk} is not availible, availible speakers: {self.speaker_map.keys()}")
4158
+ if return_audio and not self.has_talker:
4159
+ raise ValueError(
4160
+ "Cannot use talker when talker module not initalized. Use `enable_talker` "
4161
+ "method or set enable_talker in config to enable talker."
4162
+ )
4163
+ if return_audio is None:
4164
+ return_audio = self.has_talker
4165
+ assert input_ids is not None
4166
+ if input_ids.shape[0] != 1 and return_audio:
4167
+ raise NotImplementedError("Qwen2.5-Omni currently does not support batched inference with audio output")
4168
+ shared_kwargs = {"use_audio_in_video": use_audio_in_video}
4169
+ thinker_kwargs = {
4170
+ "max_new_tokens": thinker_max_new_tokens,
4171
+ }
4172
+ talker_kwargs: dict[str, Union[torch.Tensor, Any]] = {
4173
+ "max_new_tokens": talker_max_new_tokens,
4174
+ "do_sample": talker_do_sample,
4175
+ "top_k": talker_top_k,
4176
+ "top_p": talker_top_p,
4177
+ "temperature": talker_temperature,
4178
+ "eos_token_id": talker_eos_token_id,
4179
+ "repetition_penalty": talker_repetition_penalty,
4180
+ }
4181
+ token2wav_kwargs = {}
4182
+
4183
+ for key, value in kwargs.items():
4184
+ if key.startswith("thinker_"):
4185
+ thinker_kwargs[key[len("thinker_") :]] = value
4186
+ elif key.startswith("talker_"):
4187
+ talker_kwargs[key[len("talker_") :]] = value
4188
+ elif key.startswith("token2wav_"):
4189
+ token2wav_kwargs[key[len("token2wav_") :]] = value
4190
+ # Process special input values
4191
+ elif key == "feature_attention_mask":
4192
+ thinker_kwargs[key] = value
4193
+ talker_kwargs["audio_feature_lengths"] = torch.sum(value, dim=1)
4194
+ elif key == "input_features" or key == "attention_mask":
4195
+ thinker_kwargs[key] = value
4196
+ # Put other key to shared kwargs
4197
+ else:
4198
+ shared_kwargs[key] = value
4199
+ # Merge kwargs
4200
+ for key, value in shared_kwargs.items():
4201
+ if key not in thinker_kwargs:
4202
+ thinker_kwargs[key] = value
4203
+ if key not in talker_kwargs:
4204
+ talker_kwargs[key] = value
4205
+ if key not in token2wav_kwargs:
4206
+ token2wav_kwargs[key] = value
4207
+ speaker_params = self.speaker_map[spk]
4208
+
4209
+ # 1. Generate from thinker module
4210
+ thinker_result = self.thinker.generate(
4211
+ input_ids=input_ids,
4212
+ return_dict_in_generate=True,
4213
+ output_hidden_states=True,
4214
+ **thinker_kwargs,
4215
+ )
4216
+ if not (return_audio and self.has_talker):
4217
+ return thinker_result.sequences
4218
+
4219
+ # 2. Generate speech tokens from talker module
4220
+ thinker_generate_ids = thinker_result.sequences[:, input_ids.size(1) :].to(self.talker.device)
4221
+ thinker_token_embeds = [x[0].to(self.talker.device) for x in thinker_result.hidden_states]
4222
+ thinker_hidden_states = [x[1][-1].to(self.talker.device) for x in thinker_result.hidden_states]
4223
+
4224
+ talker_text_bos_token = speaker_params["bos_token"]
4225
+ talker_input_text_ids = torch.cat(
4226
+ [
4227
+ input_ids.to(self.talker.device),
4228
+ torch.tensor([[talker_text_bos_token]], dtype=torch.long, device=self.talker.device),
4229
+ thinker_generate_ids[:, :1],
4230
+ ],
4231
+ dim=-1,
4232
+ )
4233
+
4234
+ talker_input_ids = torch.cat(
4235
+ [
4236
+ torch.full_like(input_ids, fill_value=self.talker.codec_mask_token, device=self.talker.device),
4237
+ torch.tensor([[self.talker.codec_pad_token]], dtype=torch.long, device=self.talker.device),
4238
+ torch.tensor([[self.talker.codec_bos_token]], dtype=torch.long, device=self.talker.device),
4239
+ ],
4240
+ dim=1,
4241
+ )
4242
+
4243
+ thinker_reply_part = torch.cat(thinker_hidden_states[1:], dim=1) + torch.cat(thinker_token_embeds[1:], dim=1)
4244
+ talker_inputs_embeds = thinker_hidden_states[0] + thinker_token_embeds[0]
4245
+ talker_inputs_embeds = torch.cat(
4246
+ [
4247
+ talker_inputs_embeds,
4248
+ self.thinker.get_input_embeddings()(
4249
+ torch.tensor([[talker_text_bos_token]], dtype=torch.long, device=self.thinker.device)
4250
+ ).to(self.talker.device),
4251
+ thinker_reply_part[:, :1, :],
4252
+ ],
4253
+ dim=1,
4254
+ )
4255
+
4256
+ thinker_reply_part = torch.cat(
4257
+ [
4258
+ thinker_reply_part[:, 1:, :],
4259
+ self.thinker.get_input_embeddings()(
4260
+ torch.tensor([[self.talker.text_eos_token]], dtype=torch.long, device=self.thinker.device)
4261
+ ).to(self.talker.device),
4262
+ self.thinker.get_input_embeddings()(
4263
+ torch.tensor([[self.talker.text_pad_token]], dtype=torch.long, device=self.thinker.device)
4264
+ ).to(self.talker.device),
4265
+ ],
4266
+ dim=1,
4267
+ )
4268
+
4269
+ talker_attention_mask = torch.cat(
4270
+ [kwargs["attention_mask"], kwargs["attention_mask"].new_ones((1, 2))], dim=1
4271
+ ).to(self.talker.device)
4272
+
4273
+ talker_result = self.talker.generate(
4274
+ input_ids=talker_input_ids,
4275
+ input_text_ids=talker_input_text_ids,
4276
+ thinker_reply_part=thinker_reply_part,
4277
+ inputs_embeds=talker_inputs_embeds,
4278
+ attention_mask=talker_attention_mask,
4279
+ suppress_tokens=[self.talker.codec_bos_token],
4280
+ **{k: (v.to(self.talker.device) if torch.is_tensor(v) else v) for k, v in talker_kwargs.items()},
4281
+ )
4282
+ talker_generate_codes = talker_result[:, talker_input_ids.shape[1] : -1]
4283
+
4284
+ # 3. Generate wavs from code
4285
+ if self.token2wav.dtype != torch.float:
4286
+ self.token2wav.float()
4287
+ wav = self.token2wav(
4288
+ talker_generate_codes.to(self.token2wav.device),
4289
+ cond=speaker_params["cond"].to(self.token2wav.device).float(),
4290
+ ref_mel=speaker_params["ref_mel"].to(self.token2wav.device).float(),
4291
+ **token2wav_kwargs,
4292
+ )
4293
+
4294
+ return thinker_result.sequences, wav.float()
4295
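+
+ # Editorial sketch (hedged): generate() runs a three-stage pipeline: (1) the thinker
+ # produces text tokens and hidden states, (2) the talker consumes those hidden
+ # states plus the speaker embedding to emit speech codec tokens, (3) token2wav turns
+ # the codes into a waveform. Keyword arguments are routed by prefix ("thinker_",
+ # "talker_", "token2wav_"); the rest are shared. Illustrative (hypothetical) usage:
+ #     text_ids, wav = model.generate(input_ids=ids, attention_mask=mask, spk="Chelsie")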
+
4296
+
4297
+ __all__ = [
4298
+ "Qwen2_5OmniModel",
4299
+ "Qwen2_5OmniThinkerModel",
4300
+ "Qwen2_5OmniThinkerForConditionalGeneration",
4301
+ "Qwen2_5OmniTalkerModel",
4302
+ "Qwen2_5OmniTalkerForConditionalGeneration",
4303
+ "Qwen2_5OmniToken2WavDiTModel",
4304
+ "Qwen2_5OmniToken2WavBigVGANModel",
4305
+ "Qwen2_5OmniToken2WavModel",
4306
+ "Qwen2_5OmniPreTrainedModel",
4307
+ "Qwen2_5OmniPreTrainedModelForConditionalGeneration",
4308
+ ]