languagebind 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ import cv2
2
+ import torch
3
+ from PIL import Image
4
+ from torch import nn
5
+ from torchvision import transforms
6
+ from transformers import ProcessorMixin, BatchEncoding
7
+ from transformers.image_processing_utils import BatchFeature
8
+
9
# Per-channel mean and standard deviation passed to `transforms.Normalize`
# as the last step of the depth preprocessing pipeline built in this module.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
11
+
12
def make_list_of_images(x):
    """Return ``x`` itself when it is already a ``list``, else wrap it as ``[x]``.

    Only ``list`` instances are passed through; any other value (including a
    tuple) is treated as a single item.
    """
    return x if isinstance(x, list) else [x]
16
+
17
def opencv_loader(path):
    """Read the image stored at ``path`` with OpenCV and return it as float32.

    The file is decoded with ``cv2.IMREAD_UNCHANGED`` and the resulting array
    is cast to ``float32``.

    Args:
        path: location of the image file, forwarded to ``cv2.imread``.

    Returns:
        The decoded image as a ``float32`` array.

    Raises:
        OSError: if OpenCV could not read or decode the file.
    """
    image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising; fail here with the offending path instead of
        # an AttributeError on `.astype`.
        raise OSError(f"Could not read depth image (missing or unreadable file): {path!r}")
    return image.astype('float32')
19
+
20
+
21
class DepthNorm(nn.Module):
    """Rescale a raw depth array and expand it to a 3-channel tensor.

    Args:
        max_depth: upper clipping bound applied after dividing by ``scale``.
            The clipped map is then divided by this value. ``0`` (the default)
            skips the upper clip and divides by the map's own maximum instead.
        min_depth: lower clipping bound applied after dividing by ``scale``.
    """

    def __init__(self, max_depth=0, min_depth=0.01):
        super().__init__()
        self.max_depth = max_depth
        self.min_depth = min_depth
        self.scale = 1000.0  # nyuv2 abs.depth

    def forward(self, image):
        """Normalise ``image`` and return it stacked three times along dim 0.

        ``image`` must be something ``torch.from_numpy`` accepts after the
        arithmetic below; the final ``repeat(3, 1, 1)`` assumes a 2-D map
        (the original code notes ``(H, W) in meters`` after scaling).
        The result is cast to torch's current default dtype.
        """
        # Divide by the fixed scale, then enforce the lower bound.
        depth = (image / self.scale).clip(min=self.min_depth)

        if self.max_depth == 0:
            # No configured ceiling: normalise by the largest value present.
            depth /= depth.max()
        else:
            depth = depth.clip(max=self.max_depth)
            depth /= self.max_depth  # 0-1

        stacked = torch.from_numpy(depth).unsqueeze(0).repeat(3, 1, 1)  # assume image
        return stacked.to(torch.get_default_dtype())
43
+
44
def get_depth_transform(config):
    """Assemble the preprocessing pipeline applied to a loaded depth map.

    Args:
        config: object exposing ``vision_config.max_depth``; that value is
            forwarded to ``DepthNorm``.

    Returns:
        A ``transforms.Compose`` running, in order: ``DepthNorm``, bicubic
        ``Resize(224)``, ``CenterCrop(224)`` and ``Normalize`` with the
        module-level ``OPENAI_DATASET_MEAN`` / ``OPENAI_DATASET_STD``.
    """
    vision_cfg = config.vision_config
    steps = [
        DepthNorm(max_depth=vision_cfg.max_depth),
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD),  # assume image
        # Alternative normalisations left disabled by the original author:
        # transforms.Normalize((0.5, ), (0.5, ))  # 0-1 to norm distribution
        # transforms.Normalize((0.0418, ), (0.0295, ))  # sun rgb-d imagebind
        # transforms.Normalize((0.02, ), (0.00295, ))  # nyuv2
    ]
    return transforms.Compose(steps)
58
+
59
def load_and_transform_depth(depth_path, transform):
    """Load the depth image at ``depth_path`` and return ``transform`` applied to it.

    The file is read through ``opencv_loader``; ``transform`` is any callable
    accepting that loader's output.
    """
    return transform(opencv_loader(depth_path))
63
+
64
class LanguageBindDepthProcessor(ProcessorMixin):
    """Bundle a tokenizer with the depth-image preprocessing pipeline.

    Text is delegated to ``tokenizer``; each image path is loaded and
    transformed with ``load_and_transform_depth`` using the pipeline returned
    by ``get_depth_transform(config)``.

    Args:
        config: configuration object handed to ``get_depth_transform``.
        tokenizer: callable tokenizer used for ``text`` inputs and for
            ``decode`` / ``batch_decode``. May be ``None`` when only images
            are processed.
        **kwargs: forwarded to ``ProcessorMixin.__init__``.
    """

    attributes = []
    tokenizer_class = ("LanguageBindDepthTokenizer")

    def __init__(self, config, tokenizer=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.transform = get_depth_transform(config)
        self.image_processor = load_and_transform_depth
        self.tokenizer = tokenizer

    def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
        """Tokenize ``text`` and/or preprocess ``images``.

        Args:
            images: one depth-image path or a list of them.
            text: input forwarded to the tokenizer.
            context_length: ``max_length`` given to the tokenizer; text is
                padded and truncated to this length.
            return_tensors: forwarded to the tokenizer.
            **kwargs: extra tokenizer arguments.

        Returns:
            The tokenizer output when ``text`` is given, with a
            ``"pixel_values"`` entry added when ``images`` is given too;
            otherwise a plain dict ``{"pixel_values": tensor}``.

        Raises:
            ValueError: if both inputs are ``None``, or if ``text`` is given
                but the processor was built without a tokenizer.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        encoding = None
        if text is not None:
            if self.tokenizer is None:
                # Without this guard the call below fails with an opaque
                # "'NoneType' object is not callable".
                raise ValueError("Text was provided but this processor has no tokenizer.")
            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
                                      truncation=True, return_tensors=return_tensors, **kwargs)

        if images is None:
            return encoding

        image_features = torch.stack(
            [self.image_processor(image, self.transform) for image in make_list_of_images(images)]
        )

        if encoding is None:
            return {"pixel_values": image_features}
        encoding["pixel_values"] = image_features
        return encoding

    @staticmethod
    def _split_skip_flag(args, skip_special_tokens):
        """Support the legacy call form where the flag was the first positional.

        The previous signature was ``(skip_special_tokens=True, *args)``, so
        the only positional form that worked was ``decode(<bool>, ids)``. A
        leading ``bool`` is therefore still interpreted as the flag.
        """
        if args and isinstance(args[0], bool):
            return args[1:], args[0]
        return args, skip_special_tokens

    def batch_decode(self, *args, skip_special_tokens=True, **kwargs):
        """
        Forward all arguments to the tokenizer's ``batch_decode``.

        ``skip_special_tokens`` defaults to ``True`` and is keyword-only so
        that ``batch_decode(ids)`` passes ``ids`` to the tokenizer instead of
        binding it to the flag.
        """
        args, skip_special_tokens = self._split_skip_flag(args, skip_special_tokens)
        return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)

    def decode(self, *args, skip_special_tokens=True, **kwargs):
        """
        Forward all arguments to the tokenizer's ``decode``.

        ``skip_special_tokens`` defaults to ``True`` and is keyword-only so
        that ``decode(ids)`` passes ``ids`` to the tokenizer instead of
        binding it to the flag.
        """
        args, skip_special_tokens = self._split_skip_flag(args, skip_special_tokens)
        return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
@@ -0,0 +1,78 @@
1
+ from transformers import CLIPTokenizer
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
# File names the tokenizer expects for its vocabulary and BPE merges.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download locations of those files, keyed by checkpoint identifier.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/vocab.json",
    },
    "merges_file": {
        "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/merges.txt",
    },
}

# Maximum input length per checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "lb203/LanguageBind-Depth": 77,
}


# Per-checkpoint init overrides. Keyed by the Depth checkpoint so it agrees
# with the other tables in this module (it previously named the Thermal
# checkpoint, which no other table here refers to).
PRETRAINED_INIT_CONFIGURATION = {
    "lb203/LanguageBind-Depth": {},
}
28
+
29
class LanguageBindDepthTokenizer(CLIPTokenizer):
    """
    Tokenizer for the LanguageBind depth model; a thin subclass of [`CLIPTokenizer`].

    It only rebinds the class-level vocabulary tables and changes the default
    `pad_token` to `<|endoftext|>`; every tokenization method is inherited.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token, substituted for anything not in the vocabulary.
        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The padding token; set by default so that padding is enabled.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        # Collect the special tokens once, then hand everything to CLIPTokenizer.
        special_tokens = {
            "unk_token": unk_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
            "pad_token": pad_token,
        }
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            **special_tokens,
            **kwargs,
        )
File without changes
@@ -0,0 +1,413 @@
1
+ import copy
2
+ import os
3
+ from typing import Union
4
+
5
+ from transformers import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+ logger = logging.get_logger(__name__)
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
class CLIPTextConfig(PretrainedConfig):
    r"""
    Configuration of the CLIP-style text encoder used by the LanguageBind image model.

    Configuration objects inherit from [`PretrainedConfig`]; read its documentation for the options handled by the
    base class.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the text model.
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored on the config as `projection_dim`; presumably the size of the text projection output (consumed
            by model code outside this file).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 49406):
            Forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 49407):
            Forwarded to [`PretrainedConfig`].

    `add_time_attn` is always set to `False` on this config; it is not a constructor argument.
    """
    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/clip
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        # Fixed value: unlike the vision config, this is not configurable here.
        self.add_time_attn = False

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a text config, unwrapping it from a composite config if needed.

        When the loaded dictionary belongs to a composite model (`"clip"`, or
        `"LanguageBindImage"` as declared by the composite config in this
        file), its `"text_config"` entry is used. A warning is logged when
        the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Composite checkpoints nest the text settings under "text_config".
        # The composite config defined in this file uses model_type
        # "LanguageBindImage", so it must be recognised alongside "clip".
        if config_dict.get("model_type") in ("clip", "LanguageBindImage"):
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
124
+
125
+
126
+
127
+
128
class CLIPVisionConfig(PretrainedConfig):
    r"""
    Configuration of the CLIP-style vision encoder used by the LanguageBind image model.

    Configuration objects inherit from [`PretrainedConfig`]; read its documentation for the options handled by the
    base class.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored on the config as `projection_dim`; presumably the size of the vision projection output.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Stored on the config as `num_channels`; presumably the number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    The remaining arguments are LanguageBind additions. This class only stores them; their meaning is defined by
    model code outside this file, so the descriptions below are read from the names and should be confirmed there:

        add_time_attn (`bool`, *optional*, defaults to `False`)
        num_frames (`int`, *optional*, defaults to 1)
        force_patch_dropout (`float`, *optional*, defaults to 0.0)
        lora_r (`int`, *optional*, defaults to 2), lora_alpha (`int`, *optional*, defaults to 16),
        lora_dropout (`float`, *optional*, defaults to 0.0):
            presumably LoRA hyper-parameters.
        num_mel_bins (*optional*, defaults to 0.0), target_length (*optional*, defaults to 0.0):
            presumably audio-spectrogram settings; note the defaults are floats.
        video_decode_backend (`str`, *optional*, defaults to `'decord'`)
    """

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,

        # LanguageBind-specific additions (see class docstring).
        add_time_attn=False,
        num_frames=1,
        force_patch_dropout=0.0,
        lora_r=2,
        lora_alpha=16,
        lora_dropout=0.0,
        num_mel_bins=0.0,
        target_length=0.0,
        video_decode_backend='decord',
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # LanguageBind-specific additions, stored verbatim.
        self.add_time_attn = add_time_attn
        self.num_frames = num_frames
        self.force_patch_dropout = force_patch_dropout
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.num_mel_bins = num_mel_bins
        self.target_length = target_length
        self.video_decode_backend = video_decode_backend

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load a vision config, unwrapping it from a composite config if needed.

        When the loaded dictionary belongs to a composite model (`"clip"`, or
        `"LanguageBindImage"` as declared by the composite config in this
        file), its `"vision_config"` entry is used. A warning is logged when
        the resulting `model_type` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Composite checkpoints nest the vision settings under "vision_config".
        # The composite config defined in this file uses model_type
        # "LanguageBindImage", so it must be recognised alongside "clip".
        if config_dict.get("model_type") in ("clip", "LanguageBindImage"):
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
251
+
252
+
253
class LanguageBindImageConfig(PretrainedConfig):
    r"""
    Composite configuration holding a [`CLIPTextConfig`] and a [`CLIPVisionConfig`] for the LanguageBind image model.

    Configuration objects inherit from [`PretrainedConfig`]; read its documentation for the options handled by the
    base class.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`]. Defaults are used when `None`.
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. Defaults are used when
            `None`.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are popped
            before the rest is passed to [`PretrainedConfig`]; when present, their values override those in
            `text_config` / `vision_config`.

    The dictionaries passed as `text_config` / `vision_config` are never modified.
    """

    model_type = "LanguageBindImage"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
        # of confusion!).
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, the values in
        # `[text|vision]_config_dict` update the values in `[text|vision]_config`.
        if text_config_dict is not None:
            text_config = self._merge_legacy_config_dict(
                "text_config", CLIPTextConfig, text_config, text_config_dict
            )

        if vision_config_dict is not None:
            vision_config = self._merge_legacy_config_dict(
                "vision_config", CLIPVisionConfig, vision_config, vision_config_dict, stringify_id2label=True
            )

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @staticmethod
    def _merge_legacy_config_dict(name, config_cls, config, legacy_dict, stringify_id2label=False):
        """Overlay a legacy ``<name>_dict`` on ``config`` and return the merged copy.

        Args:
            name: ``"text_config"`` or ``"vision_config"``; only used in the
                warning messages.
            config_cls: sub-config class used to expand ``legacy_dict`` into a
                complete dictionary (defaults included).
            config: the dictionary passed as ``<name>``, or ``None``.
            legacy_dict: the dictionary passed as ``<name>_dict``.
            stringify_id2label: convert ``id2label`` keys to ``str`` before
                comparing and merging.

        Returns:
            A new dictionary; ``config`` itself is left untouched.
        """
        # Work on a copy so the caller's dictionary is not mutated.
        merged = {} if config is None else dict(config)

        # This is the complete result when using the legacy dict.
        resolved = config_cls(**legacy_dict).to_dict()
        if stringify_id2label and "id2label" in resolved:
            # convert keys to string instead of integer
            resolved["id2label"] = {str(key): value for key, value in resolved["id2label"].items()}

        # Warn about every value present in both sources but differing.
        for key, value in resolved.items():
            if key in merged and value != merged[key] and key not in ["transformers_version"]:
                if key in legacy_dict:
                    # Explicitly specified in the legacy dict.
                    message = (
                        f"`{key}` is found in both `{name}_dict` and `{name}` but with different values. "
                        f'The value `{name}_dict["{key}"]` will be used instead.'
                    )
                else:
                    # Inferred from default argument values (just to be super careful).
                    message = (
                        f"`{name}_dict` is provided which will be used to initialize `{config_cls.__name__}`. "
                        f'The value `{name}["{key}"]` will be overriden.'
                    )
                logger.warning(message)

        merged.update(resolved)
        return merged

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Instantiate this configuration (or a derived class) from a text model configuration and a vision model
        configuration.

        Returns:
            An instance of `cls` built from the two configurations' `to_dict()` output.
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: a deep copy of the instance attributes in which `text_config` and `vision_config` are
            replaced by their own `to_dict()` output and `model_type` is set from the class.
        """
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
        output["vision_config"] = self.vision_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output