crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -124,6 +124,7 @@ class QwenAudioLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False, cached=False, error=str(model_error), completions=[], embedding=[]
             )
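
The two hunks above are from helm/clients/audio_language/qwen_audiolm_client.py (+2 -1 in the file list): the client now imports the hexception helper from helm.common.hierarchical_logger (which itself gains 20 lines in this release) and calls it on the caught RuntimeError before returning a failed RequestResult. The many other small +2 -0 / +2 -1 client changes listed above look consistent with the same logging being threaded through the rest of the clients. A minimal sketch of the pattern, using a hypothetical handle_model_error helper and assuming hexception simply logs the exception and its traceback:

from helm.common.hierarchical_logger import hexception
from helm.common.request import RequestResult


def handle_model_error(model_error: RuntimeError) -> RequestResult:
    # Hypothetical helper mirroring the hunk above: log the exception via
    # hexception, then surface a failed RequestResult instead of re-raising.
    hexception(model_error)
    return RequestResult(
        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
    )
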
@@ -0,0 +1,519 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_qwen2_5_omni.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen2_5OmniVisionEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen2_5OmniThinkerVision`].
+    It is used to instantiate a Qwen2.5-VL vision encoder according to the specified arguments,
+    defining the model architecture. Instantiating a configuration with the defaults will yield a
+    similar configuration to that of the audio encoder of the Qwen2.5-VL architecture.
+
+    e.g. [Qwen/Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the
+    model outputs. Read the documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        depth (`int`, *optional*, defaults to 32):
+            Number of layers (depth) in the model.
+        hidden_size (`int`, *optional*, defaults to 3584):
+            The size of the hidden layers.
+        hidden_act (`str`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function used in the model. Supported options include
+            `"quick_gelu"` and others as applicable.
+        mlp_ratio (`float`, *optional*, defaults to 4):
+            The ratio used to determine the size of the MLP (Multi-Layer Perceptron) hidden layer.
+        num_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size of the patches extracted from the input.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+
+    Example:
+    >>> # xdoctest: +SKIP
+    >>> from transformers import Qwen2_5OmniVisionEncoderConfig, Qwen2_5OmniVisionEncoder
+    >>> # Initializing a Qwen2_5OmniVisionEncoderConfig
+    >>> configuration = Qwen2_5OmniVisionEncoderConfig()
+    >>> # Initializing a Qwen2_5OmniVisionEncoder (with random weights)
+    >>> model = Qwen2_5OmniVisionEncoder(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    """
+
+    model_type = "qwen2_5_omni_vision_encoder"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=32,
+        hidden_size=3584,
+        hidden_act="silu",
+        intermediate_size=3420,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        window_size=112,
+        out_hidden_size=3584,
+        fullatt_block_indexes=[7, 15, 23, 31],
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.window_size = window_size
+        self.fullatt_block_indexes = fullatt_block_indexes
+        self.out_hidden_size = out_hidden_size
+
+
+class Qwen2_5OmniAudioEncoderConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_audio_encoder"
+
+    def __init__(
+        self,
+        num_mel_bins=128,
+        encoder_layers=32,
+        encoder_attention_heads=20,
+        encoder_ffn_dim=5120,
+        d_model=1280,
+        dropout=0,
+        attention_dropout=0,
+        activation_function="gelu",
+        activation_dropout=0,
+        scale_embedding=False,
+        initializer_range=0.02,
+        max_source_positions=1500,
+        n_window=100,
+        output_dim=3584,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_function = activation_function
+        self.activation_dropout = activation_dropout
+        self.num_hidden_layers = encoder_layers
+        self.initializer_range = initializer_range
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.max_source_positions = max_source_positions
+        self.n_window = n_window
+        self.output_dim = output_dim
+
+
+class Qwen2_5OmniTextConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_text"
+    is_composition = False
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=3584,
+        intermediate_size=18944,
+        num_hidden_layers=28,
+        num_attention_heads=28,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=32768,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        if self.rope_scaling is None:
+            self.rope_scaling = {"mrope_section": [16, 24, 24], "rope_type": "default", "type": "default"}
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+class Qwen2_5OmniThinkerConfig(PretrainedConfig):
+    model_type = "qwen2_5_omni_thinker"
+    sub_configs = {
+        "audio_config": Qwen2_5OmniAudioEncoderConfig,
+        "vision_config": Qwen2_5OmniVisionEncoderConfig,
+        "text_config": Qwen2_5OmniTextConfig,
+    }
+    is_composition = True
+
+    def __init__(
+        self,
+        audio_config=None,
+        vision_config=None,
+        text_config=None,
+        audio_token_index=151646,
+        image_token_index=151655,
+        video_token_index=151656,
+        tie_word_embeddings=False,
+        position_id_per_seconds=25,
+        seconds_per_chunk=2,
+        audio_start_token_id=151647,
+        audio_end_token_id=151648,
+        user_token_id=872,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        self.audio_token_index = audio_token_index
+        self.image_token_index = image_token_index
+        self.video_token_index = video_token_index
+        # 2025.02.20 the add
+        self.user_token_id = user_token_id
+        self.position_id_per_seconds = position_id_per_seconds
+        self.seconds_per_chunk = seconds_per_chunk
+        self.audio_start_token_id = audio_start_token_id
+        self.audio_end_token_id = audio_end_token_id
+        self.initializer_range = initializer_range
+
+        if isinstance(vision_config, dict):
+            vision_config = Qwen2_5OmniVisionEncoderConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = Qwen2_5OmniVisionEncoderConfig()
+        self.vision_config = vision_config
+
+        if isinstance(audio_config, dict):
+            audio_config = Qwen2_5OmniAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Qwen2_5OmniAudioEncoderConfig()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config = Qwen2_5OmniTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Qwen2_5OmniTextConfig()
+        self.text_config = text_config
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen2_5OmniTalkerConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_talker"
+    is_composition = False
+
+    def __init__(
+        self,
+        audio_token_index=151646,
+        image_token_index=151655,
+        video_token_index=151656,
+        vocab_size=8448,
+        tts_text_start_token_id=151860,
+        tts_text_end_token_id=151861,
+        tts_text_pad_token_id=151859,
+        tts_codec_start_token_id=8293,
+        tts_codec_end_token_id=8294,
+        tts_codec_pad_token_id=8292,
+        tts_codec_mask_token_id=8296,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        embedding_size=3584,
+        hidden_size=3584,
+        intermediate_size=18944,
+        num_hidden_layers=28,
+        num_attention_heads=28,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        rms_norm_eps=1e-06,
+        head_dim=128,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=32768,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        position_id_per_seconds=25,
+        seconds_per_chunk=2,
+        audio_start_token_id=151647,
+        audio_end_token_id=151648,
+        initializer_range=0.02,
+        spatial_merge_size=2,
+        **kwargs,
+    ):
+        self.audio_token_index = audio_token_index
+        self.image_token_index = image_token_index
+        self.video_token_index = video_token_index
+
+        self.tts_text_start_token_id = tts_text_start_token_id
+        self.tts_text_end_token_id = tts_text_end_token_id
+        self.tts_text_pad_token_id = tts_text_pad_token_id
+        self.tts_codec_start_token_id = tts_codec_start_token_id
+        self.tts_codec_end_token_id = tts_codec_end_token_id
+        self.tts_codec_pad_token_id = tts_codec_pad_token_id
+
+        self.tts_codec_mask_token_id = tts_codec_mask_token_id
+
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+
+        self.vocab_size = vocab_size
+        self.head_dim = head_dim
+        self.embedding_size = embedding_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.position_id_per_seconds = position_id_per_seconds  # zf
+        self.seconds_per_chunk = seconds_per_chunk  # zf
+        self.audio_start_token_id = audio_start_token_id  # zf
+        self.audio_end_token_id = audio_end_token_id  # zf
+
+        self.initializer_range = initializer_range
+        self.spatial_merge_size = spatial_merge_size
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen2_5OmniDiTConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_dit"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=22,
+        num_attention_heads=16,
+        ff_mult=2,
+        emb_dim=512,
+        head_dim=64,
+        rope_theta=10000.0,
+        max_position_embeddings=32768,
+        block_size=24,
+        look_ahead_layers=[10],
+        look_backward_layers=[0, 20],
+        repeats=2,
+        num_embeds=8193,
+        mel_dim=80,
+        dropout=0.1,
+        enc_emb_dim=192,
+        enc_dim=128,
+        enc_channels=[256, 256, 256, 256, 768],
+        enc_kernel_sizes=[5, 3, 3, 3, 1],
+        enc_dilations=[1, 2, 3, 4, 1],
+        enc_attention_channels=64,
+        enc_res2net_scale=2,
+        enc_se_channels=64,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.ff_mult = ff_mult
+        self.emb_dim = emb_dim
+        self.head_dim = head_dim
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.block_size = block_size
+        self.look_ahead_layers = look_ahead_layers
+        self.look_backward_layers = look_backward_layers
+        self.repeats = repeats
+        self.num_embeds = num_embeds
+        self.mel_dim = mel_dim
+        self.dropout = dropout
+        self.enc_emb_dim = enc_emb_dim
+        self.enc_dim = enc_dim
+        self.enc_channels = enc_channels
+        self.enc_kernel_sizes = enc_kernel_sizes
+        self.enc_dilations = enc_dilations
+        self.enc_attention_channels = enc_attention_channels
+        self.enc_res2net_scale = enc_res2net_scale
+        self.enc_se_channels = enc_se_channels
+        super().__init__(**kwargs)
+
+
+class Qwen2_5OmniBigVGANConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_bigvgan"
+
+    def __init__(
+        self,
+        mel_dim=80,
+        upsample_initial_channel=1536,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        upsample_rates=[5, 3, 2, 2, 2, 2],
+        upsample_kernel_sizes=[11, 7, 4, 4, 4, 4],
+        **kwargs,
+    ):
+        self.mel_dim = mel_dim
+        self.upsample_initial_channel = upsample_initial_channel
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        super().__init__(**kwargs)
+
+
+class Qwen2_5OmniToken2WavConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni_token2wav"
+    sub_configs = {
+        "dit_config": Qwen2_5OmniDiTConfig,
+        "bigvgan_config": Qwen2_5OmniBigVGANConfig,
+    }
+    is_composition = True
+
+    def __init__(self, dit_config=None, bigvgan_config=None, **kwargs):
+        if dit_config is None:
+            dit_config = {}
+        if bigvgan_config is None:
+            bigvgan_config = {}
+        self.dit_config = Qwen2_5OmniDiTConfig(**dit_config)
+        self.bigvgan_config = Qwen2_5OmniBigVGANConfig(**bigvgan_config)
+        super().__init__(**kwargs)
+
+
+class Qwen2_5OmniConfig(PretrainedConfig):
+
+    model_type = "qwen2_5_omni"
+    sub_configs = {
+        "thinker_config": Qwen2_5OmniThinkerConfig,
+        "talker_config": Qwen2_5OmniTalkerConfig,
+        "token2wav_config": Qwen2_5OmniToken2WavConfig,
+    }
+    is_composition = True
+
+    def __init__(
+        self,
+        thinker_config=None,
+        talker_config=None,
+        token2wav_config=None,
+        enable_audio_output: bool = True,
+        **kwargs,
+    ):
+        if thinker_config is None:
+            thinker_config = {}
+            logger.info("thinker_config is None. Initializing thinker model with default values")
+
+        if talker_config is None:
+            talker_config = {}
+            logger.info("talker_config is None. Initializing talker model with default values")
+
+        if token2wav_config is None:
+            token2wav_config = {}
+            logger.info("token2wav_config is None. Initializing token2wav model with default values")
+
+        self.thinker_config = Qwen2_5OmniThinkerConfig(**thinker_config)
+        self.talker_config = Qwen2_5OmniTalkerConfig(**talker_config)
+        self.token2wav_config = Qwen2_5OmniToken2WavConfig(**token2wav_config)
+        self.enable_audio_output = enable_audio_output
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_sub_model_configs(
+        cls,
+        thinker_config: Qwen2_5OmniThinkerConfig,
+        talker_config: Qwen2_5OmniTalkerConfig,
+        token2wav_config: Qwen2_5OmniToken2WavConfig,
+        enable_audio_output: bool = True,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`Qwen2_5OmniConfig`] (or a derived class) from sub-models configuration.
+
+        Returns:
+            [`Qwen2_5OmniConfig`]: An instance of a configuration object
+        """
+        return cls(
+            thinker_config=thinker_config.to_dict(),
+            talker_config=talker_config.to_dict(),
+            token2wav_config=token2wav_config.to_dict(),
+            enable_audio_output=enable_audio_output,
+            **kwargs,
+        )
+
+
+__all__ = ["Qwen2_5OmniConfig", "Qwen2_5OmniThinkerConfig", "Qwen2_5OmniTalkerConfig", "Qwen2_5OmniToken2WavConfig"]
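
The new file above is helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py (+519 -0), a vendored copy of the Hugging Face Qwen2.5-Omni configuration classes. A minimal usage sketch based only on the classes defined above (the module path comes from the file list; all defaults are the constructor values shown):

from helm.clients.audio_language.qwen_omni.configuration_qwen2_5_omni import (
    Qwen2_5OmniConfig,
    Qwen2_5OmniTalkerConfig,
    Qwen2_5OmniThinkerConfig,
    Qwen2_5OmniToken2WavConfig,
)

# With no arguments, every sub-config falls back to its defaults
# (and the top-level config logs that it is doing so).
config = Qwen2_5OmniConfig()

# Alternatively, assemble the top-level config from explicit sub-model configs
# via the from_sub_model_configs classmethod defined above.
config = Qwen2_5OmniConfig.from_sub_model_configs(
    thinker_config=Qwen2_5OmniThinkerConfig(),
    talker_config=Qwen2_5OmniTalkerConfig(),
    token2wav_config=Qwen2_5OmniToken2WavConfig(),
    enable_audio_output=False,
)

The top-level config is a composition of thinker, talker, and token2wav sub-configs, mirroring the sub_configs mapping in Qwen2_5OmniConfig.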