languagebind 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- languagebind/__init__.py +91 -0
- languagebind/_compat.py +24 -0
- languagebind/audio/__init__.py +0 -0
- languagebind/audio/configuration_audio.py +420 -0
- languagebind/audio/modeling_audio.py +1031 -0
- languagebind/audio/processing_audio.py +174 -0
- languagebind/audio/tokenization_audio.py +78 -0
- languagebind/depth/__init__.py +0 -0
- languagebind/depth/configuration_depth.py +415 -0
- languagebind/depth/modeling_depth.py +1031 -0
- languagebind/depth/processing_depth.py +108 -0
- languagebind/depth/tokenization_depth.py +78 -0
- languagebind/image/__init__.py +0 -0
- languagebind/image/configuration_image.py +413 -0
- languagebind/image/modeling_image.py +1031 -0
- languagebind/image/processing_image.py +77 -0
- languagebind/image/tokenization_image.py +78 -0
- languagebind/thermal/__init__.py +0 -0
- languagebind/thermal/configuration_thermal.py +413 -0
- languagebind/thermal/modeling_thermal.py +1031 -0
- languagebind/thermal/processing_thermal.py +77 -0
- languagebind/thermal/tokenization_thermal.py +78 -0
- languagebind/video/__init__.py +0 -0
- languagebind/video/configuration_video.py +413 -0
- languagebind/video/modeling_video.py +1143 -0
- languagebind/video/processing_video.py +174 -0
- languagebind/video/tokenization_video.py +78 -0
- languagebind-0.1.0.dist-info/METADATA +71 -0
- languagebind-0.1.0.dist-info/RECORD +30 -0
- languagebind-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import cv2
|
|
2
|
+
import torch
|
|
3
|
+
from PIL import Image
|
|
4
|
+
from torch import nn
|
|
5
|
+
from torchvision import transforms
|
|
6
|
+
from transformers import ProcessorMixin, BatchEncoding
|
|
7
|
+
from transformers.image_processing_utils import BatchFeature
|
|
8
|
+
|
|
9
|
+
# Per-channel mean and standard deviation (three values each) that
# `get_depth_transform` passes to `transforms.Normalize`.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
|
|
11
|
+
|
|
12
|
+
def make_list_of_images(x):
    """Normalize ``x`` into a list of items.

    Args:
        x: A single item, or a ``list`` / ``tuple`` of items.

    Returns:
        ``x`` itself when it is already a list, ``list(x)`` when it is a
        tuple, otherwise a one-element list containing ``x``.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        # A tuple is a batch as well. It used to be wrapped as one single
        # "image", which only failed later when handed to the loader.
        return list(x)
    return [x]
|
|
16
|
+
|
|
17
|
+
def opencv_loader(path):
    """Read the image stored at ``path`` with OpenCV as a float32 array.

    The file is decoded with ``cv2.IMREAD_UNCHANGED``.

    Args:
        path: Location of the image file, as accepted by ``cv2.imread``.

    Returns:
        The decoded image converted with ``.astype('float32')``.

    Raises:
        OSError: If ``cv2.imread`` could not read the file.
    """
    image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # without this check the caller only sees an AttributeError on
        # `.astype` that does not mention the offending path.
        raise OSError(f"cv2.imread could not read an image from {path!r}")
    return image.astype('float32')
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DepthNorm(nn.Module):
    """Rescale a raw depth array and return it as a 3-channel tensor.

    The input is divided by ``scale`` (1000.0), clipped from below at
    ``min_depth`` and then normalized: by ``max_depth`` (after clipping from
    above) when ``max_depth`` is non-zero, otherwise by the array's own
    maximum.

    Args:
        max_depth: Upper clipping bound and divisor; ``0`` selects
            normalization by the per-image maximum instead.
        min_depth: Lower clipping bound applied after scaling.
    """

    def __init__(self, max_depth=0, min_depth=0.01):
        super().__init__()
        self.max_depth = max_depth
        self.min_depth = min_depth
        self.scale = 1000.0  # nyuv2 abs.depth

    def forward(self, image):
        """Normalize ``image`` and return a tensor in torch's default dtype.

        ``image`` must be a NumPy array (it goes through
        ``torch.from_numpy``). The result is ``unsqueeze(0)``-ed and repeated
        three times along the new leading axis.
        # NOTE(review): presumably a 2-D (H, W) depth map whose values become
        # meters after the division — confirm against the callers.
        """
        depth = (image / self.scale).clip(min=self.min_depth)
        if self.max_depth == 0:
            # No fixed range configured: normalize by the largest value.
            divisor = depth.max()
        else:
            depth = depth.clip(max=self.max_depth)
            divisor = self.max_depth
        depth /= divisor
        stacked = torch.from_numpy(depth).unsqueeze(0).repeat(3, 1, 1)
        return stacked.to(torch.get_default_dtype())
|
|
43
|
+
|
|
44
|
+
def get_depth_transform(config):
    """Build the preprocessing pipeline applied to a loaded depth array.

    Args:
        config: Object whose ``vision_config.max_depth`` is forwarded to
            ``DepthNorm``.

    Returns:
        A ``transforms.Compose`` that runs ``DepthNorm``, a bicubic resize to
        224, a 224 center crop and a per-channel normalization.
    """
    vision_cfg = config.vision_config
    steps = [
        DepthNorm(max_depth=vision_cfg.max_depth),
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        # The 3-channel depth tensor is normalized like an image.
        # Single-channel alternatives noted by the original author:
        #   (0.5,), (0.5,)        0-1 to norm distribution
        #   (0.0418,), (0.0295,)  sun rgb-d imagebind
        #   (0.02,), (0.00295,)   nyuv2
        transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD),
    ]
    return transforms.Compose(steps)
|
|
58
|
+
|
|
59
|
+
def load_and_transform_depth(depth_path, transform):
    """Load the depth image at ``depth_path`` and return ``transform`` applied to it.

    Args:
        depth_path: Path handed to ``opencv_loader``.
        transform: Callable applied to the loaded array.
    """
    return transform(opencv_loader(depth_path))
|
|
63
|
+
|
|
64
|
+
class LanguageBindDepthProcessor(ProcessorMixin):
    """Prepare depth images and/or text for a LanguageBind depth model.

    Images are given as paths: each one is loaded and run through the
    pipeline built by ``get_depth_transform(config)``. Text is delegated to
    the optional ``tokenizer``.

    Args:
        config: Configuration passed to ``get_depth_transform``.
        tokenizer: Callable tokenizer used for ``text``; may be ``None`` when
            only images are processed.
        **kwargs: Forwarded to ``ProcessorMixin.__init__``.
    """

    attributes = []
    tokenizer_class = ("LanguageBindDepthTokenizer")

    def __init__(self, config, tokenizer=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.transform = get_depth_transform(config)
        self.image_processor = load_and_transform_depth
        self.tokenizer = tokenizer

    def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
        """Encode ``text`` and/or ``images``.

        Args:
            images: One image path or a list of them.
            text: Text input forwarded to the tokenizer.
            context_length: ``max_length`` used for padding and truncation.
            return_tensors: Forwarded to the tokenizer.
            **kwargs: Extra keyword arguments forwarded to the tokenizer.

        Returns:
            The tokenizer output (with ``"pixel_values"`` added when images
            are given), or ``{"pixel_values": tensor}`` for images only.

        Raises:
            ValueError: If both inputs are ``None``, or if ``text`` is given
                but the processor was built without a tokenizer.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        encoding = None
        if text is not None:
            if self.tokenizer is None:
                # Fail with an actionable message instead of
                # "'NoneType' object is not callable".
                raise ValueError("`text` was given but this processor has no tokenizer.")
            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
                                      truncation=True, return_tensors=return_tensors, **kwargs)

        if images is None:
            return encoding

        images = make_list_of_images(images)
        image_features = torch.stack(
            [self.image_processor(image, self.transform) for image in images]
        )
        if encoding is None:
            return {"pixel_values": image_features}
        encoding["pixel_values"] = image_features
        return encoding

    @staticmethod
    def _split_decode_args(skip_special_tokens, args, kwargs, ids_key):
        """Return ``(skip_special_tokens, args)`` with positional ids untangled.

        ``decode`` / ``batch_decode`` declare ``skip_special_tokens`` as their
        first positional parameter, so the natural call ``decode(ids)`` binds
        the ids to that flag. Such calls are detected by the flag not being a
        ``bool`` and the ids are moved back into ``args``. Calls that already
        worked (``decode(True, ids)``, keyword usage) pass through unchanged.
        """
        if isinstance(skip_special_tokens, bool):
            return skip_special_tokens, args
        if not args:
            if ids_key in kwargs:
                # The ids were passed by keyword; keep the original forwarding.
                return skip_special_tokens, args
            return True, (skip_special_tokens,)
        if isinstance(args[0], bool):
            # Tokenizer-style call: (ids, skip_special_tokens, ...).
            return args[0], (skip_special_tokens, *args[1:])
        return skip_special_tokens, args

    def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
        """
        Forward to the tokenizer's ``batch_decode``; ``skip_special_tokens``
        defaults to ``True``. Sequences may be passed as the first positional
        argument or by keyword.
        """
        skip_special_tokens, args = self._split_decode_args(
            skip_special_tokens, args, kwargs, "sequences"
        )
        return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)

    def decode(self, skip_special_tokens=True, *args, **kwargs):
        """
        Forward to the tokenizer's ``decode``; ``skip_special_tokens``
        defaults to ``True``. Token ids may be passed as the first positional
        argument or by keyword.
        """
        skip_special_tokens, args = self._split_decode_args(
            skip_special_tokens, args, kwargs, "token_ids"
        )
        return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from transformers import CLIPTokenizer
|
|
2
|
+
from transformers.utils import logging
|
|
3
|
+
|
|
4
|
+
logger = logging.get_logger(__name__)
|
|
5
|
+
|
|
6
|
+
# File names of the vocabulary and merges files; exposed on the tokenizer
# class as `vocab_files_names`.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download URLs of those files, keyed by checkpoint id.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/vocab.json",
    },
    "merges_file": {
        "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/merges.txt",
    },
}

# Maximum input length per checkpoint id; exposed as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "lb203/LanguageBind-Depth": 77,
}


# Keyed by the same checkpoint id as the maps above (this entry previously
# named the Thermal checkpoint, a copy-paste slip).
PRETRAINED_INIT_CONFIGURATION = {
    "lb203/LanguageBind-Depth": {},
}
|
|
28
|
+
|
|
29
|
+
class LanguageBindDepthTokenizer(CLIPTokenizer):
    """
    `CLIPTokenizer` subclass wired to this module's vocabulary maps.

    All tokenization behaviour is inherited from `CLIPTokenizer`; this class
    only supplies the file maps and the constructor defaults below.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token, used for tokens that are not in the vocabulary.
        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The padding token; set to the end-of-sequence token so that
            padding is enabled.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        special_tokens = {
            "unk_token": unk_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
            "pad_token": pad_token,
        }
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            **special_tokens,
            **kwargs,
        )
|
|
File without changes
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import os
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
from transformers import PretrainedConfig
|
|
6
|
+
from transformers.utils import logging
|
|
7
|
+
|
|
8
|
+
logger = logging.get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
    text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the text encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    In addition to the arguments below, every instance carries `add_time_attn = False`; it is not configurable
    through the constructor.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`CLIPModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`. Presumably the dimensionality of the text projection layer; confirm against
            the modeling code.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to `PretrainedConfig.__init__`.
        bos_token_id (`int`, *optional*, defaults to 49406):
            Forwarded to `PretrainedConfig.__init__`.
        eos_token_id (`int`, *optional*, defaults to 49407):
            Forwarded to `PretrainedConfig.__init__`.

    Example:

    ```python
    >>> from transformers import CLIPTextConfig, CLIPTextModel

    >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPTextConfig()

    >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/clip
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.add_time_attn = False  # not a constructor argument: always False on the text config

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a text config from `pretrained_model_name_or_path`.

        When the loaded dictionary has `model_type == "clip"`, its nested `text_config` entry is used. A warning is
        logged if the resulting `model_type` differs from this class's `model_type`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the text config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class CLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
    CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`. Presumably the dimensionality of the vision projection layer; confirm against
            the modeling code.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Stored as `num_channels`. Presumably the number of input channels; confirm against the modeling code.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        add_time_attn (`bool`, *optional*, defaults to `False`):
            Stored as-is; its consumer is not visible in this file.
        num_frames (`int`, *optional*, defaults to 1):
            Stored as-is; its consumer is not visible in this file.
        force_patch_dropout (`float`, *optional*, defaults to 0.0):
            Stored as-is; its consumer is not visible in this file.
        lora_r (`int`, *optional*, defaults to 2):
            Stored as-is; presumably a LoRA rank, confirm against the modeling code.
        lora_alpha (`int`, *optional*, defaults to 16):
            Stored as-is; presumably the LoRA scaling factor, confirm against the modeling code.
        lora_dropout (`float`, *optional*, defaults to 0.0):
            Stored as-is; presumably the LoRA dropout, confirm against the modeling code.
        num_mel_bins (*optional*, defaults to 0.0):
            Stored as-is. NOTE(review): the default is a float although the name suggests a count.
        target_length (*optional*, defaults to 0.0):
            Stored as-is. NOTE(review): the default is a float although the name suggests a length.
        video_decode_backend (`str`, *optional*, defaults to `"decord"`):
            Stored as-is; its consumer is not visible in this file.

    Example:

    ```python
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,

        # The parameters below are not part of the documented CLIP vision arguments above.
        add_time_attn=False,
        num_frames=1,
        force_patch_dropout=0.0,
        lora_r=2,
        lora_alpha=16,
        lora_dropout=0.0,
        num_mel_bins=0.0,
        target_length=0.0,
        video_decode_backend='decord',
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Extra fields, stored verbatim.
        self.add_time_attn = add_time_attn
        self.num_frames = num_frames
        self.force_patch_dropout = force_patch_dropout
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.num_mel_bins = num_mel_bins
        self.target_length = target_length
        self.video_decode_backend = video_decode_backend

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a vision config from `pretrained_model_name_or_path`.

        When the loaded dictionary has `model_type == "clip"`, its nested `vision_config` entry is used. A warning
        is logged if the resulting `model_type` differs from this class's `model_type`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class LanguageBindImageConfig(PretrainedConfig):
    r"""
    Composite configuration holding a [`CLIPTextConfig`] and a [`CLIPVisionConfig`] plus the settings shared by both.
    The structure follows the CLIP configuration of the
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are popped
            and merged into `text_config` / `vision_config`; the rest goes to `PretrainedConfig.__init__`.

    Example:

    ```python
    >>> # Initializing a configuration with default text and vision settings
    >>> configuration = LanguageBindImageConfig()

    >>> # Building it from separate text and vision configurations
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()
    >>> config = LanguageBindImageConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "LanguageBindImage"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
        # of confusion!).
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
        if text_config_dict is not None:
            # Merge into a copy: `update` below must not modify the dictionary owned by the caller.
            text_config = {} if text_config is None else dict(text_config)

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()

            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                            f'value `text_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            # Merge into a copy: `update` below must not modify the dictionary owned by the caller.
            vision_config = {} if vision_config is None else dict(vision_config)

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
            # convert keys to string instead of integer
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate this configuration (or a derived class) from a text model configuration and a vision model
        configuration.

        Returns:
            An instance of `cls` built from the two configurations' dictionaries.
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Deep copy of the instance attributes, with `text_config` and `vision_config` replaced
            by their own dictionaries and `model_type` set from the class.
        """
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
        output["vision_config"] = self.vision_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
|