languagebind 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. languagebind-0.1.0/PKG-INFO +71 -0
  2. languagebind-0.1.0/README.md +42 -0
  3. languagebind-0.1.0/languagebind/__init__.py +91 -0
  4. languagebind-0.1.0/languagebind/_compat.py +24 -0
  5. languagebind-0.1.0/languagebind/audio/__init__.py +0 -0
  6. languagebind-0.1.0/languagebind/audio/configuration_audio.py +420 -0
  7. languagebind-0.1.0/languagebind/audio/modeling_audio.py +1031 -0
  8. languagebind-0.1.0/languagebind/audio/processing_audio.py +174 -0
  9. languagebind-0.1.0/languagebind/audio/tokenization_audio.py +78 -0
  10. languagebind-0.1.0/languagebind/depth/__init__.py +0 -0
  11. languagebind-0.1.0/languagebind/depth/configuration_depth.py +415 -0
  12. languagebind-0.1.0/languagebind/depth/modeling_depth.py +1031 -0
  13. languagebind-0.1.0/languagebind/depth/processing_depth.py +108 -0
  14. languagebind-0.1.0/languagebind/depth/tokenization_depth.py +78 -0
  15. languagebind-0.1.0/languagebind/image/__init__.py +0 -0
  16. languagebind-0.1.0/languagebind/image/configuration_image.py +413 -0
  17. languagebind-0.1.0/languagebind/image/modeling_image.py +1031 -0
  18. languagebind-0.1.0/languagebind/image/processing_image.py +77 -0
  19. languagebind-0.1.0/languagebind/image/tokenization_image.py +78 -0
  20. languagebind-0.1.0/languagebind/thermal/__init__.py +0 -0
  21. languagebind-0.1.0/languagebind/thermal/configuration_thermal.py +413 -0
  22. languagebind-0.1.0/languagebind/thermal/modeling_thermal.py +1031 -0
  23. languagebind-0.1.0/languagebind/thermal/processing_thermal.py +77 -0
  24. languagebind-0.1.0/languagebind/thermal/tokenization_thermal.py +78 -0
  25. languagebind-0.1.0/languagebind/video/__init__.py +0 -0
  26. languagebind-0.1.0/languagebind/video/configuration_video.py +413 -0
  27. languagebind-0.1.0/languagebind/video/modeling_video.py +1143 -0
  28. languagebind-0.1.0/languagebind/video/processing_video.py +174 -0
  29. languagebind-0.1.0/languagebind/video/tokenization_video.py +78 -0
  30. languagebind-0.1.0/makefile +9 -0
  31. languagebind-0.1.0/pyproject.toml +39 -0
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: languagebind
3
+ Version: 0.1.0
4
+ Summary: LanguageBind: multimodal (video/audio/image/depth/thermal) embedding model aligned to text via language-based semantic alignment. Packaged for pip-installability with compatibility patches for modern transformers/torchvision/torchaudio.
5
+ Project-URL: Homepage, https://github.com/PKU-YuanGroup/LanguageBind
6
+ Project-URL: Source, https://github.com/embeddings-benchmark/languagebind
7
+ License: MIT
8
+ Requires-Python: >=3.9
9
+ Requires-Dist: einops>=0.8.0
10
+ Requires-Dist: peft>=0.11.0
11
+ Requires-Dist: torch>=2.0.0
12
+ Requires-Dist: transformers<5.0.0,>=4.40.0
13
+ Provides-Extra: all
14
+ Requires-Dist: decord>=0.6.0; extra == 'all'
15
+ Requires-Dist: opencv-python-headless>=4.5.0; extra == 'all'
16
+ Requires-Dist: pytorchvideo>=0.1.5; extra == 'all'
17
+ Requires-Dist: soundfile>=0.12.0; extra == 'all'
18
+ Requires-Dist: torchaudio>=0.13.0; extra == 'all'
19
+ Requires-Dist: torchvision>=0.15.0; extra == 'all'
20
+ Provides-Extra: audio
21
+ Requires-Dist: soundfile>=0.12.0; extra == 'audio'
22
+ Requires-Dist: torchaudio>=0.13.0; extra == 'audio'
23
+ Provides-Extra: video
24
+ Requires-Dist: decord>=0.6.0; extra == 'video'
25
+ Requires-Dist: opencv-python-headless>=4.5.0; extra == 'video'
26
+ Requires-Dist: pytorchvideo>=0.1.5; extra == 'video'
27
+ Requires-Dist: torchvision>=0.15.0; extra == 'video'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # languagebind
31
+
32
+ [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
33
+
34
+ The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install languagebind
40
+ ```
41
+
42
+ For video support:
43
+ ```bash
44
+ pip install "languagebind[video]"
45
+ ```
46
+
47
+ For audio support:
48
+ ```bash
49
+ pip install "languagebind[audio]"
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ```python
55
+ from languagebind import (
56
+ LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
57
+ LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
58
+ LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
59
+ )
60
+ ```
61
+
62
+ ## Compatibility patches
63
+
64
+ - `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
65
+ - `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
66
+ - `torchvision.transforms._transforms_video`: private API fallback to public `torchvision.transforms` equivalents
67
+ - `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
68
+
69
+ ## License
70
+
71
+ MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).
@@ -0,0 +1,42 @@
1
+ # languagebind
2
+
3
+ [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
4
+
5
+ The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install languagebind
11
+ ```
12
+
13
+ For video support:
14
+ ```bash
15
+ pip install "languagebind[video]"
16
+ ```
17
+
18
+ For audio support:
19
+ ```bash
20
+ pip install "languagebind[audio]"
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```python
26
+ from languagebind import (
27
+ LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
28
+ LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
29
+ LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
30
+ )
31
+ ```
32
+
33
+ ## Compatibility patches
34
+
35
+ - `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
36
+ - `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
37
+ - `torchvision.transforms._transforms_video`: private API fallback to public `torchvision.transforms` equivalents
38
+ - `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
39
+
40
+ ## License
41
+
42
+ MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).
@@ -0,0 +1,91 @@
1
+ import torch
2
+ from torch import nn
3
+ from transformers import AutoConfig
4
+
5
+ from .image.configuration_image import LanguageBindImageConfig
6
+ from .image.modeling_image import LanguageBindImage
7
+ from .image.tokenization_image import LanguageBindImageTokenizer
8
+ from .image.processing_image import LanguageBindImageProcessor
9
+
10
+ from .video.configuration_video import LanguageBindVideoConfig
11
+ from .video.modeling_video import LanguageBindVideo
12
+ from .video.tokenization_video import LanguageBindVideoTokenizer
13
+ from .video.processing_video import LanguageBindVideoProcessor
14
+
15
+ from .depth.configuration_depth import LanguageBindDepthConfig
16
+ from .depth.modeling_depth import LanguageBindDepth
17
+ from .depth.tokenization_depth import LanguageBindDepthTokenizer
18
+ from .depth.processing_depth import LanguageBindDepthProcessor
19
+
20
+ from .audio.configuration_audio import LanguageBindAudioConfig
21
+ from .audio.modeling_audio import LanguageBindAudio
22
+ from .audio.tokenization_audio import LanguageBindAudioTokenizer
23
+ from .audio.processing_audio import LanguageBindAudioProcessor
24
+
25
+ from .thermal.configuration_thermal import LanguageBindThermalConfig
26
+ from .thermal.modeling_thermal import LanguageBindThermal
27
+ from .thermal.tokenization_thermal import LanguageBindThermalTokenizer
28
+ from .thermal.processing_thermal import LanguageBindThermalProcessor
29
+
30
+
31
+
32
# Registries keyed by modality name. Each one maps a modality to the class
# that handles that role for it; insertion order is kept as originally written.

# modality -> configuration class
config_dict = dict(
    thermal=LanguageBindThermalConfig,
    image=LanguageBindImageConfig,
    video=LanguageBindVideoConfig,
    depth=LanguageBindDepthConfig,
    audio=LanguageBindAudioConfig,
)

# modality -> model class (looked up by `LanguageBind.__init__`)
model_dict = dict(
    thermal=LanguageBindThermal,
    image=LanguageBindImage,
    video=LanguageBindVideo,
    depth=LanguageBindDepth,
    audio=LanguageBindAudio,
)

# modality -> input processor class
transform_dict = dict(
    video=LanguageBindVideoProcessor,
    audio=LanguageBindAudioProcessor,
    depth=LanguageBindDepthProcessor,
    thermal=LanguageBindThermalProcessor,
    image=LanguageBindImageProcessor,
)
53
+
54
class LanguageBind(nn.Module):
    """Bundle several LanguageBind modality towers with one shared text tower.

    For every ``modality -> checkpoint name`` pair in ``clip_type`` the matching
    class from ``model_dict`` is loaded from ``LanguageBind/<checkpoint name>``.
    Its ``vision_model``, ``visual_projection``, ``logit_scale`` and ``config``
    are stored under the modality key.  The text tower (``text_model`` /
    ``text_projection``) is taken from the *last* checkpoint loaded and stored
    under the key ``'language'``.

    Args:
        clip_type: Mapping of modality name (a key of ``model_dict``) to the
            checkpoint name appended to ``LanguageBind/``.  Must not be empty.
        use_temp: When true, non-language embeddings are multiplied by the
            exponentiated logit scale of their modality in ``forward``.
        cache_dir: Forwarded to ``from_pretrained``.

    Raises:
        ValueError: If ``clip_type`` is empty (there would be no checkpoint to
            take the text tower from).
        KeyError: If a modality name is not present in ``model_dict``.
    """

    def __init__(self, clip_type, use_temp=True, cache_dir='./cache_dir'):
        super(LanguageBind, self).__init__()
        if not clip_type:
            # Previously this fell through to `model.text_model` with `model`
            # never assigned and crashed with an UnboundLocalError.
            raise ValueError(
                "clip_type must map at least one modality to a checkpoint name; "
                "the language tower is taken from the last loaded checkpoint."
            )

        self.use_temp = use_temp
        encoders = {}
        projections = {}
        # NOTE(review): the logit scales live in a plain dict, so they are not
        # registered as parameters of this module (not in `state_dict()` or
        # `parameters()`).  Kept as-is to avoid changing the state-dict layout.
        self.modality_scale = {}
        self.modality_config = {}

        model = None
        for k, v in clip_type.items():
            pretrained_ckpt = f'LanguageBind/{v}'
            model = model_dict[k].from_pretrained(pretrained_ckpt, cache_dir=cache_dir)
            encoders[k] = model.vision_model
            projections[k] = model.visual_projection
            self.modality_scale[k] = model.logit_scale
            self.modality_config[k] = model.config

        # The text tower comes from whichever checkpoint was loaded last.
        encoders['language'] = model.text_model
        projections['language'] = model.text_projection

        # ModuleDict registration makes the towers follow `.to()`, `.eval()`, etc.
        self.modality_encoder = nn.ModuleDict(encoders)
        self.modality_proj = nn.ModuleDict(projections)

    def forward(self, inputs):
        """Embed every modality present in ``inputs``.

        Args:
            inputs: Mapping of modality key (including ``'language'``) to a dict
                of keyword arguments for that modality's encoder.

        Returns:
            dict: One L2-normalised embedding per input key.  When ``use_temp``
            is set, every key except ``'language'`` is additionally scaled by
            ``exp(logit_scale)`` of its modality.
        """
        outputs = {}
        for key, value in inputs.items():
            # Element 1 of the encoder output is used; presumably the pooled
            # output -- confirm against the encoder implementation.
            value = self.modality_encoder[key](**value)[1]
            value = self.modality_proj[key](value)
            value = value / value.norm(p=2, dim=-1, keepdim=True)
            if self.use_temp and key != 'language':
                value = value * self.modality_scale[key].exp()
            outputs[key] = value
        return outputs
86
+
87
def to_device(x, device):
    """Return a new dict holding ``value.to(device)`` for every item of ``x``.

    Args:
        x: Mapping whose values expose a ``.to(device)`` method.
        device: Target passed unchanged to each value's ``.to``.

    Returns:
        dict: Same keys as ``x``, in the same order, with the moved values.
    """
    moved = {}
    for name, item in x.items():
        moved[name] = item.to(device)
    return moved
90
+
91
+
@@ -0,0 +1,24 @@
1
+ """Compatibility shims for symbols removed from newer versions of transformers/torchaudio/torchvision."""
2
+ from __future__ import annotations
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+ try:
8
+ from transformers.models.clip.modeling_clip import _expand_mask
9
+ except ImportError:
10
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None) -> torch.Tensor:
11
+ bsz, src_len = mask.size()
12
+ tgt_len = tgt_len if tgt_len is not None else src_len
13
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
14
+ inverted_mask = 1.0 - expanded_mask
15
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
16
+
17
try:
    from transformers.models.clip.modeling_clip import clip_loss
except ImportError:
    def _contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
        """Cross-entropy of ``logits`` against the targets ``0 .. len(logits) - 1``."""
        targets = torch.arange(len(logits), device=logits.device)
        return F.cross_entropy(logits, targets)

    def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
        """Fallback used when ``transformers`` no longer exports ``clip_loss``.

        Returns the mean of the contrastive loss computed on ``similarity`` and
        on its transpose.
        """
        forward_loss = _contrastive_loss(similarity)
        transposed_loss = _contrastive_loss(similarity.t())
        return (forward_loss + transposed_loss) / 2.0
File without changes
@@ -0,0 +1,420 @@
1
+ import copy
2
+ import os
3
+ from typing import Union
4
+
5
+ from transformers import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+ logger = logging.get_logger(__name__)
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
class CLIPTextConfig(PretrainedConfig):
    r"""
    Settings for the CLIP-style text encoder bundled with this package.

    Every constructor argument below is stored as an attribute of the same name. The three token ids and any extra
    keyword arguments are passed on to [`PretrainedConfig`]. `add_time_attn` is always set to `False` on this config.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the text model.
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feed-forward ("intermediate") layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads per attention layer.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            Maximum sequence length the model might be used with.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            Non-linear activation used in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer for weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            Factor applied when initializing weight matrices (should be kept at 1).
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 49406):
            Forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 49407):
            Forwarded to [`PretrainedConfig`].
    """

    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # The pad id default differs from `CLIPTokenizer`'s default and from openai/clip, see
        # https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Architecture hyper-parameters (assignment order is kept stable so the
        # attribute dict keeps its original ordering).
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        # Not configurable for the text tower: always False.
        self.add_time_attn = False

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a text config from a pretrained name or path.

        When the loaded dictionary has `model_type == "clip"`, its nested `"text_config"` entry is used instead of
        the top-level dictionary. A warning is logged if the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        loaded, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the text settings one level down.
        if loaded.get("model_type") == "clip":
            loaded = loaded["text_config"]

        if "model_type" in loaded and hasattr(cls, "model_type"):
            if loaded["model_type"] != cls.model_type:
                logger.warning(
                    f"You are using a model of type {loaded['model_type']} to instantiate a model of type "
                    f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
                )

        return cls.from_dict(loaded, **kwargs)
124
+
125
+
126
+
127
+
128
class CLIPVisionConfig(PretrainedConfig):
    r"""
    Settings for the CLIP-style vision encoder bundled with this package.

    Every constructor argument below is stored as an attribute of the same name; extra keyword arguments are passed
    on to [`PretrainedConfig`].

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward ("intermediate") layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads per attention layer.
        num_channels (`int`, *optional*, defaults to 3):
            Stored as `num_channels`.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            Non-linear activation used in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer for weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            Factor applied when initializing weight matrices (should be kept at 1).
        add_time_attn, num_frames, force_patch_dropout, lora_r, lora_alpha, lora_dropout, num_mel_bins,
        target_length, video_decode_backend, audio_sample_rate, audio_mean, audio_std:
            LanguageBind-specific options with defaults `False`, `1`, `0.0`, `2`, `16`, `0.0`, `0.0`, `0.0`,
            `'decord'`, `16000`, `0.5` and `0.5` respectively. They are only stored here; the code that consumes
            them lives outside this class.
    """

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # --- LanguageBind additions ---
        add_time_attn=False,
        num_frames=1,
        force_patch_dropout=0.0,
        lora_r=2,
        lora_alpha=16,
        lora_dropout=0.0,
        num_mel_bins=0.0,
        target_length=0.0,
        video_decode_backend='decord',
        audio_sample_rate=16000,
        audio_mean=0.5,
        audio_std=0.5,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Standard CLIP vision hyper-parameters (assignment order is kept stable
        # so the attribute dict keeps its original ordering).
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # LanguageBind additions, stored verbatim.
        self.add_time_attn = add_time_attn
        self.num_frames = num_frames
        self.force_patch_dropout = force_patch_dropout
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.num_mel_bins = num_mel_bins
        self.target_length = target_length
        self.video_decode_backend = video_decode_backend

        # Audio preprocessing settings, stored verbatim.
        self.audio_sample_rate = audio_sample_rate
        self.audio_mean = audio_mean
        self.audio_std = audio_std

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Build a vision config from a pretrained name or path.

        When the loaded dictionary has `model_type == "clip"`, its nested `"vision_config"` entry is used instead of
        the top-level dictionary. A warning is logged if the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        loaded, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the vision settings one level down.
        if loaded.get("model_type") == "clip":
            loaded = loaded["vision_config"]

        if "model_type" in loaded and hasattr(cls, "model_type"):
            if loaded["model_type"] != cls.model_type:
                logger.warning(
                    f"You are using a model of type {loaded['model_type']} to instantiate a model of type "
                    f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
                )

        return cls.from_dict(loaded, **kwargs)
258
+
259
+
260
class LanguageBindAudioConfig(PretrainedConfig):
    r"""
    Composite configuration for the LanguageBind audio model: one [`CLIPTextConfig`] plus one [`CLIPVisionConfig`]
    (both defined in this module) and the settings shared between them.

    Args:
        text_config (`dict`, *optional*):
            Options used to build the [`CLIPTextConfig`]. Defaults are used when `None`.
        vision_config (`dict`, *optional*):
            Options used to build the [`CLIPVisionConfig`]. Defaults are used when `None`.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            Stored as `logit_scale_init_value`.
        kwargs (*optional*):
            Remaining keyword arguments, forwarded to [`PretrainedConfig`]. The legacy keys `text_config_dict` and
            `vision_config_dict` are popped first; when given, their values override `text_config` /
            `vision_config` (a warning is logged for each conflicting key).

    Example:

    ```python
    config = LanguageBindAudioConfig.from_text_vision_configs(CLIPTextConfig(), CLIPVisionConfig())
    ```"""

    model_type = "LanguageBindAudio"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # The legacy `*_config_dict` arguments are removed from `kwargs` before the
        # parent constructor runs so that they are never stored on the instance.
        legacy_text = kwargs.pop("text_config_dict", None)
        legacy_vision = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # A legacy dict does not replace `[text|vision]_config` wholesale: it is
        # expanded to a full config dict and merged on top of it.
        if legacy_text is not None:
            if text_config is None:
                text_config = {}

            # Full set of values implied by `text_config_dict`.
            resolved_text = CLIPTextConfig(**legacy_text).to_dict()

            # Report every key on which the two sources disagree.
            for name, resolved in resolved_text.items():
                conflicting = name in text_config and resolved != text_config[name]
                if not conflicting or name in ["transformers_version"]:
                    continue
                if name in legacy_text:
                    # Explicitly given in `text_config_dict`.
                    message = (
                        f"`{name}` is found in both `text_config_dict` and `text_config` but with different values. "
                        f'The value `text_config_dict["{name}"]` will be used instead.'
                    )
                else:
                    # Came from a default argument value.
                    message = (
                        f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                        f'value `text_config["{name}"]` will be overriden.'
                    )
                logger.warning(message)

            # The resolved legacy values win.
            text_config.update(resolved_text)

        if legacy_vision is not None:
            if vision_config is None:
                vision_config = {}

            # Full set of values implied by `vision_config_dict`.
            resolved_vision = CLIPVisionConfig(**legacy_vision).to_dict()
            # Label ids are compared and stored with string keys.
            if "id2label" in resolved_vision:
                resolved_vision["id2label"] = {
                    str(label_id): label for label_id, label in resolved_vision["id2label"].items()
                }

            # Report every key on which the two sources disagree.
            for name, resolved in resolved_vision.items():
                conflicting = name in vision_config and resolved != vision_config[name]
                if not conflicting or name in ["transformers_version"]:
                    continue
                if name in legacy_vision:
                    # Explicitly given in `vision_config_dict`.
                    message = (
                        f"`{name}` is found in both `vision_config_dict` and `vision_config` but with different "
                        f'values. The value `vision_config_dict["{name}"]` will be used instead.'
                    )
                else:
                    # Came from a default argument value.
                    message = (
                        f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                        f'The value `vision_config["{name}"]` will be overriden.'
                    )
                logger.warning(message)

            # The resolved legacy values win.
            vision_config.update(resolved_vision)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Build an instance of this class (or a subclass) from an existing text config and vision config.

        Both configs are converted with `to_dict()` and passed to the constructor together with `kwargs`.

        Returns:
            An instance of `cls`.
        """
        text_as_dict = text_config.to_dict()
        vision_as_dict = vision_config.to_dict()
        return cls(text_config=text_as_dict, vision_config=vision_as_dict, **kwargs)

    def to_dict(self):
        """
        Serialize this instance to a Python dictionary, overriding [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: A deep copy of the instance attributes in which `text_config` and `vision_config` are
            replaced by their own `to_dict()` output and `model_type` is set to the class-level value.
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized["text_config"] = self.text_config.to_dict()
        serialized["vision_config"] = self.vision_config.to_dict()
        serialized["model_type"] = self.__class__.model_type
        return serialized