languagebind 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- languagebind-0.1.0/PKG-INFO +71 -0
- languagebind-0.1.0/README.md +42 -0
- languagebind-0.1.0/languagebind/__init__.py +91 -0
- languagebind-0.1.0/languagebind/_compat.py +24 -0
- languagebind-0.1.0/languagebind/audio/__init__.py +0 -0
- languagebind-0.1.0/languagebind/audio/configuration_audio.py +420 -0
- languagebind-0.1.0/languagebind/audio/modeling_audio.py +1031 -0
- languagebind-0.1.0/languagebind/audio/processing_audio.py +174 -0
- languagebind-0.1.0/languagebind/audio/tokenization_audio.py +78 -0
- languagebind-0.1.0/languagebind/depth/__init__.py +0 -0
- languagebind-0.1.0/languagebind/depth/configuration_depth.py +415 -0
- languagebind-0.1.0/languagebind/depth/modeling_depth.py +1031 -0
- languagebind-0.1.0/languagebind/depth/processing_depth.py +108 -0
- languagebind-0.1.0/languagebind/depth/tokenization_depth.py +78 -0
- languagebind-0.1.0/languagebind/image/__init__.py +0 -0
- languagebind-0.1.0/languagebind/image/configuration_image.py +413 -0
- languagebind-0.1.0/languagebind/image/modeling_image.py +1031 -0
- languagebind-0.1.0/languagebind/image/processing_image.py +77 -0
- languagebind-0.1.0/languagebind/image/tokenization_image.py +78 -0
- languagebind-0.1.0/languagebind/thermal/__init__.py +0 -0
- languagebind-0.1.0/languagebind/thermal/configuration_thermal.py +413 -0
- languagebind-0.1.0/languagebind/thermal/modeling_thermal.py +1031 -0
- languagebind-0.1.0/languagebind/thermal/processing_thermal.py +77 -0
- languagebind-0.1.0/languagebind/thermal/tokenization_thermal.py +78 -0
- languagebind-0.1.0/languagebind/video/__init__.py +0 -0
- languagebind-0.1.0/languagebind/video/configuration_video.py +413 -0
- languagebind-0.1.0/languagebind/video/modeling_video.py +1143 -0
- languagebind-0.1.0/languagebind/video/processing_video.py +174 -0
- languagebind-0.1.0/languagebind/video/tokenization_video.py +78 -0
- languagebind-0.1.0/makefile +9 -0
- languagebind-0.1.0/pyproject.toml +39 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: languagebind
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LanguageBind: multimodal (video/audio/image/depth/thermal) embedding model aligned to text via language-based semantic alignment. Packaged for pip-installability with compatibility patches for modern transformers/torchvision/torchaudio.
|
|
5
|
+
Project-URL: Homepage, https://github.com/PKU-YuanGroup/LanguageBind
|
|
6
|
+
Project-URL: Source, https://github.com/embeddings-benchmark/languagebind
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Requires-Dist: einops>=0.8.0
|
|
10
|
+
Requires-Dist: peft>=0.11.0
|
|
11
|
+
Requires-Dist: torch>=2.0.0
|
|
12
|
+
Requires-Dist: transformers<5.0.0,>=4.40.0
|
|
13
|
+
Provides-Extra: all
|
|
14
|
+
Requires-Dist: decord>=0.6.0; extra == 'all'
|
|
15
|
+
Requires-Dist: opencv-python-headless>=4.5.0; extra == 'all'
|
|
16
|
+
Requires-Dist: pytorchvideo>=0.1.5; extra == 'all'
|
|
17
|
+
Requires-Dist: soundfile>=0.12.0; extra == 'all'
|
|
18
|
+
Requires-Dist: torchaudio>=0.13.0; extra == 'all'
|
|
19
|
+
Requires-Dist: torchvision>=0.15.0; extra == 'all'
|
|
20
|
+
Provides-Extra: audio
|
|
21
|
+
Requires-Dist: soundfile>=0.12.0; extra == 'audio'
|
|
22
|
+
Requires-Dist: torchaudio>=0.13.0; extra == 'audio'
|
|
23
|
+
Provides-Extra: video
|
|
24
|
+
Requires-Dist: decord>=0.6.0; extra == 'video'
|
|
25
|
+
Requires-Dist: opencv-python-headless>=4.5.0; extra == 'video'
|
|
26
|
+
Requires-Dist: pytorchvideo>=0.1.5; extra == 'video'
|
|
27
|
+
Requires-Dist: torchvision>=0.15.0; extra == 'video'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# languagebind
|
|
31
|
+
|
|
32
|
+
[LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
|
|
33
|
+
|
|
34
|
+
The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install languagebind
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
For video support:
|
|
43
|
+
```bash
|
|
44
|
+
pip install "languagebind[video]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
For audio support:
|
|
48
|
+
```bash
|
|
49
|
+
pip install "languagebind[audio]"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from languagebind import (
|
|
56
|
+
LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
|
|
57
|
+
LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
|
|
58
|
+
LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Compatibility patches
|
|
63
|
+
|
|
64
|
+
- `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
|
|
65
|
+
- `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
|
|
66
|
+
- `torchvision.transforms._transforms_video`: private API; falls back to the public `torchvision.transforms` equivalents when it is unavailable
|
|
67
|
+
- `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# languagebind
|
|
2
|
+
|
|
3
|
+
[LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
|
|
4
|
+
|
|
5
|
+
The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install languagebind
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
For video support:
|
|
14
|
+
```bash
|
|
15
|
+
pip install "languagebind[video]"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
For audio support:
|
|
19
|
+
```bash
|
|
20
|
+
pip install "languagebind[audio]"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from languagebind import (
|
|
27
|
+
LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
|
|
28
|
+
LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
|
|
29
|
+
LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
|
|
30
|
+
)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Compatibility patches
|
|
34
|
+
|
|
35
|
+
- `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
|
|
36
|
+
- `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
|
|
37
|
+
- `torchvision.transforms._transforms_video`: private API; falls back to the public `torchvision.transforms` equivalents when it is unavailable
|
|
38
|
+
- `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
|
|
39
|
+
|
|
40
|
+
## License
|
|
41
|
+
|
|
42
|
+
MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from torch import nn
|
|
3
|
+
from transformers import AutoConfig
|
|
4
|
+
|
|
5
|
+
from .image.configuration_image import LanguageBindImageConfig
|
|
6
|
+
from .image.modeling_image import LanguageBindImage
|
|
7
|
+
from .image.tokenization_image import LanguageBindImageTokenizer
|
|
8
|
+
from .image.processing_image import LanguageBindImageProcessor
|
|
9
|
+
|
|
10
|
+
from .video.configuration_video import LanguageBindVideoConfig
|
|
11
|
+
from .video.modeling_video import LanguageBindVideo
|
|
12
|
+
from .video.tokenization_video import LanguageBindVideoTokenizer
|
|
13
|
+
from .video.processing_video import LanguageBindVideoProcessor
|
|
14
|
+
|
|
15
|
+
from .depth.configuration_depth import LanguageBindDepthConfig
|
|
16
|
+
from .depth.modeling_depth import LanguageBindDepth
|
|
17
|
+
from .depth.tokenization_depth import LanguageBindDepthTokenizer
|
|
18
|
+
from .depth.processing_depth import LanguageBindDepthProcessor
|
|
19
|
+
|
|
20
|
+
from .audio.configuration_audio import LanguageBindAudioConfig
|
|
21
|
+
from .audio.modeling_audio import LanguageBindAudio
|
|
22
|
+
from .audio.tokenization_audio import LanguageBindAudioTokenizer
|
|
23
|
+
from .audio.processing_audio import LanguageBindAudioProcessor
|
|
24
|
+
|
|
25
|
+
from .thermal.configuration_thermal import LanguageBindThermalConfig
|
|
26
|
+
from .thermal.modeling_thermal import LanguageBindThermal
|
|
27
|
+
from .thermal.tokenization_thermal import LanguageBindThermalTokenizer
|
|
28
|
+
from .thermal.processing_thermal import LanguageBindThermalProcessor
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Modality name -> configuration class for that modality.
config_dict = {
    'thermal': LanguageBindThermalConfig,
    'image': LanguageBindImageConfig,
    'video': LanguageBindVideoConfig,
    'depth': LanguageBindDepthConfig,
    'audio': LanguageBindAudioConfig
}
# Modality name -> model class. `LanguageBind.__init__` indexes this table with
# the keys of its `clip_type` argument to pick the class to load.
model_dict = {
    'thermal': LanguageBindThermal,
    'image': LanguageBindImage,
    'video': LanguageBindVideo,
    'depth': LanguageBindDepth,
    'audio': LanguageBindAudio
}
# Modality name -> processor class for that modality's inputs.
transform_dict = {
    'video': LanguageBindVideoProcessor,
    'audio': LanguageBindAudioProcessor,
    'depth': LanguageBindDepthProcessor,
    'thermal': LanguageBindThermalProcessor,
    'image': LanguageBindImageProcessor,
}
|
|
53
|
+
|
|
54
|
+
class LanguageBind(nn.Module):
    """Bundle several single-modality LanguageBind checkpoints in one module.

    For every requested modality the constructor loads the matching class
    from ``model_dict`` and keeps its ``vision_model``, ``visual_projection``,
    ``logit_scale`` and ``config``. The text tower (``text_model`` and
    ``text_projection``) is stored under the key ``'language'`` and is taken
    from the last checkpoint that was loaded.

    Args:
        clip_type: Mapping of modality name (a key of ``model_dict``) to a
            checkpoint name; the checkpoint ``f'LanguageBind/{name}'`` is
            loaded for that modality.
        use_temp: When true, ``forward`` multiplies each non-language
            embedding by ``exp(logit_scale)`` of its modality.
        cache_dir: Passed through to ``from_pretrained`` as ``cache_dir``.

    Raises:
        ValueError: If ``clip_type`` contains no entries, because there would
            be no checkpoint to take the text encoder from.
        KeyError: If a modality name is not a key of ``model_dict``.
    """

    def __init__(self, clip_type, use_temp=True, cache_dir='./cache_dir'):
        super().__init__()
        self.use_temp = use_temp

        encoders = {}
        projections = {}
        # NOTE(review): these two stay plain dicts, so the logit scales are not
        # registered as module parameters (they are absent from state_dict()
        # and are not moved by .to()). Kept as-is to preserve the existing
        # state_dict layout.
        self.modality_scale = {}
        self.modality_config = {}

        model = None
        for modality, checkpoint_name in clip_type.items():
            pretrained_ckpt = f'LanguageBind/{checkpoint_name}'
            model = model_dict[modality].from_pretrained(pretrained_ckpt, cache_dir=cache_dir)
            encoders[modality] = model.vision_model
            projections[modality] = model.visual_projection
            self.modality_scale[modality] = model.logit_scale
            self.modality_config[modality] = model.config

        if model is None:
            # Previously this surfaced as an UnboundLocalError on `model`.
            raise ValueError(
                "clip_type must map at least one modality to a checkpoint name; "
                "the language encoder is taken from the last loaded model."
            )

        # The text tower comes from whichever checkpoint was loaded last.
        encoders['language'] = model.text_model
        projections['language'] = model.text_projection

        self.modality_encoder = nn.ModuleDict(encoders)
        self.modality_proj = nn.ModuleDict(projections)

    def forward(self, inputs):
        """Embed every modality present in ``inputs``.

        Args:
            inputs: Mapping of modality name to a dict of keyword arguments
                for that modality's encoder.

        Returns:
            Dict with the same keys as ``inputs``. Each value is the projected,
            L2-normalised (over the last dim) encoder output; non-language
            entries are additionally scaled by ``exp(logit_scale)`` when
            ``use_temp`` is true.
        """
        outputs = {}
        for key, value in inputs.items():
            # Element [1] of the encoder output is used as the embedding
            # (presumably the pooled output - confirm in the modeling code).
            value = self.modality_encoder[key](**value)[1]
            value = self.modality_proj[key](value)
            value = value / value.norm(p=2, dim=-1, keepdim=True)
            if self.use_temp and key != 'language':
                value = value * self.modality_scale[key].exp()
            outputs[key] = value
        return outputs
|
|
86
|
+
|
|
87
|
+
def to_device(x, device):
    """Return a new dict holding ``value.to(device)`` for every item of ``x``.

    The input mapping itself is left untouched; keys keep their order.
    """
    moved = {}
    for name, item in x.items():
        moved[name] = item.to(device)
    return moved
|
|
90
|
+
|
|
91
|
+
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Compatibility shims for symbols removed from newer versions of transformers/torchaudio/torchvision."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
import torch.nn.functional as F
|
|
6
|
+
|
|
7
|
+
try:
    from transformers.models.clip.modeling_clip import _expand_mask
except ImportError:
    def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None) -> torch.Tensor:
        """Turn a ``[bsz, src_len]`` mask into an additive ``[bsz, 1, tgt_len, src_len]`` mask.

        Positions where ``mask`` is 1 become 0; positions where it is 0 become
        the most negative finite value of ``dtype``. ``tgt_len`` defaults to
        ``src_len``. Only defined when the installed transformers no longer
        provides this helper.
        """
        batch, source_len = mask.size()
        if tgt_len is None:
            tgt_len = source_len
        keep = mask[:, None, None, :].expand(batch, 1, tgt_len, source_len).to(dtype)
        blocked = 1.0 - keep
        return blocked.masked_fill(blocked.to(torch.bool), torch.finfo(dtype).min)
|
|
16
|
+
|
|
17
|
+
try:
    from transformers.models.clip.modeling_clip import clip_loss
except ImportError:
    def _contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
        """Cross-entropy of ``logits`` against the diagonal (row ``i`` -> class ``i``)."""
        targets = torch.arange(len(logits), device=logits.device)
        return F.cross_entropy(logits, targets)

    def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
        """Average the contrastive loss of ``similarity`` and of its transpose."""
        row_loss = _contrastive_loss(similarity)
        column_loss = _contrastive_loss(similarity.t())
        return (row_loss + column_loss) / 2.0
|
|
File without changes
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import os
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
from transformers import PretrainedConfig
|
|
6
|
+
from transformers.utils import logging
|
|
7
|
+
|
|
8
|
+
logger = logging.get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CLIPTextConfig(PretrainedConfig):
    r"""
    Configuration of the CLIP-style text encoder used by this package.

    Every constructor argument is stored on the instance under the same name. The token ids are handed to
    [`PretrainedConfig`], and any extra keyword arguments are forwarded there as well.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Number of distinct tokens the text model can represent.
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feed-forward ("intermediate") layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim` (presumably the width of the text projection head; confirm in the
            modeling code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads in each attention layer.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            Maximum sequence length the model may be used with.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            Non-linear activation used in the encoder and pooler. As a string, `"gelu"`, `"relu"`, `"selu"`,
            `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal initializer for weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            Scaling factor for weight initialization (should stay at 1; used for initialization testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id, passed to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 49406):
            Beginning-of-sequence token id, passed to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 49407):
            End-of-sequence token id, passed to [`PretrainedConfig`].

    The attribute `add_time_attn` is always set to `False` for the text encoder.

    Example:

    ```python
    >>> configuration = CLIPTextConfig()
    >>> configuration.hidden_size
    512
    ```"""

    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/clip
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Vocabulary and layer sizes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings

        # Numerics, activation and initialization.
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

        # Fixed for the text encoder; not configurable through the constructor.
        self.add_time_attn = False

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a text config, unwrapping `text_config` when the source is a composite `"clip"` config.

        A warning is logged when the loaded dict declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        loaded, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the text settings under "text_config".
        # NOTE(review): only model_type == "clip" is unwrapped here; composite configs with another
        # model_type (e.g. this package's own) fall through to the warning below - confirm intended.
        if loaded.get("model_type") == "clip":
            loaded = loaded["text_config"]

        declares_type = "model_type" in loaded
        if declares_type and hasattr(cls, "model_type") and loaded["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {loaded['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(loaded, **kwargs)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class CLIPVisionConfig(PretrainedConfig):
    r"""
    Configuration of the CLIP-style encoder used for the non-text modality.

    Every constructor argument is stored on the instance under the same name; extra keyword arguments are
    forwarded to [`PretrainedConfig`].

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward ("intermediate") layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim` (presumably the width of the projection head; confirm in the modeling
            code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads in each attention layer.
        num_channels (`int`, *optional*, defaults to 3):
            Stored as `num_channels` (presumably the number of input channels; confirm in the modeling code).
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            Non-linear activation used in the encoder and pooler. As a string, `"gelu"`, `"relu"`, `"selu"`,
            `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal initializer for weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            Scaling factor for weight initialization (should stay at 1; used for initialization testing).
        add_time_attn, num_frames, force_patch_dropout, lora_r, lora_alpha, lora_dropout, num_mel_bins,
        target_length, video_decode_backend, audio_sample_rate, audio_mean, audio_std:
            Additional settings beyond the CLIP vision arguments above. They are only stored here; their
            consumers are not visible in this file, so check the modeling/processing code for their meaning.
            Note that `num_mel_bins` and `target_length` default to the float `0.0`.

    Example:

    ```python
    >>> configuration = CLIPVisionConfig()
    >>> configuration.patch_size
    32
    ```"""

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # Additional settings on top of the CLIP vision arguments.
        add_time_attn=False,
        num_frames=1,
        force_patch_dropout=0.0,
        lora_r=2,
        lora_alpha=16,
        lora_dropout=0.0,
        num_mel_bins=0.0,
        target_length=0.0,
        video_decode_backend='decord',
        audio_sample_rate=16000,
        audio_mean=0.5,
        audio_std=0.5,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Encoder geometry.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Initialization, regularization and numerics.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Additional settings, stored verbatim.
        self.add_time_attn = add_time_attn
        self.num_frames = num_frames
        self.force_patch_dropout = force_patch_dropout
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.num_mel_bins = num_mel_bins
        self.target_length = target_length
        self.video_decode_backend = video_decode_backend

        self.audio_sample_rate = audio_sample_rate
        self.audio_mean = audio_mean
        self.audio_std = audio_std

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a vision config, unwrapping `vision_config` when the source is a composite `"clip"` config.

        A warning is logged when the loaded dict declares a `model_type` different from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        loaded, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the vision settings under "vision_config".
        if loaded.get("model_type") == "clip":
            loaded = loaded["vision_config"]

        declares_type = "model_type" in loaded
        if declares_type and hasattr(cls, "model_type") and loaded["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {loaded['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(loaded, **kwargs)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class LanguageBindAudioConfig(PretrainedConfig):
    r"""
    Composite configuration for the LanguageBind audio model: a [`CLIPTextConfig`] for the text encoder plus a
    [`CLIPVisionConfig`] for the audio encoder.

    Configuration objects inherit from [`PretrainedConfig`]; extra keyword arguments are forwarded to it.

    Args:
        text_config (`dict`, *optional*):
            Options used to build the [`CLIPTextConfig`]. Defaults are used when omitted.
        vision_config (`dict`, *optional*):
            Options used to build the [`CLIPVisionConfig`]. Defaults are used when omitted.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            Initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Further keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are popped
            before calling the parent constructor; when present, their fully expanded values override the
            matching entries of `text_config` / `vision_config` (a warning is logged for each conflict).

    The dictionaries passed as `text_config` and `vision_config` are never modified.

    Example:

    ```python
    >>> # Default text and vision sub-configurations
    >>> configuration = LanguageBindAudioConfig()

    >>> # Build from explicit sub-configurations
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()
    >>> config = LanguageBindAudioConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "LanguageBindAudio"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # They are popped before calling `super().__init__` so that they are not saved with the config.
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, the values in
        # `[text|vision]_config_dict` are used to update `[text|vision]_config`. The values should be the same in
        # most cases, but configs that still carry the legacy `_config_dict` keys must keep working.
        if text_config_dict is not None:
            # Merge into a private copy so the caller's `text_config` mapping is not mutated.
            text_config = {} if text_config is None else dict(text_config)

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()

            # Warn when a value exists in both `_text_config_dict` and `text_config` but differs.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                            f'value `text_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            # Merge into a private copy so the caller's `vision_config` mapping is not mutated.
            vision_config = {} if vision_config is None else dict(vision_config)

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
            # convert keys to string instead of integer
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Warn when a value exists in both `_vision_config_dict` and `vision_config` but differs.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate this configuration (or a derived class) from a text configuration object and a vision
        configuration object.

        Both objects are converted with `to_dict()` and passed to the constructor; `kwargs` are forwarded
        unchanged.

        Returns:
            An instance of `cls`.
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        The nested `text_config` and `vision_config` objects are serialized with their own `to_dict()`, and
        `model_type` is taken from the class attribute.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
        output["vision_config"] = self.vision_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
|