languagebind 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ import cv2
2
+ import decord
3
+ import numpy as np
4
+ import torch
5
+ from PIL import Image
6
+ from decord import VideoReader, cpu
7
+ from torchvision import transforms
8
+ from transformers import ProcessorMixin, BatchEncoding
9
+ from transformers.image_processing_utils import BatchFeature
10
+ from pytorchvideo.data.encoded_video import EncodedVideo
11
+ from torchvision.transforms import Compose, Lambda, ToTensor
12
+ try:
13
+ from torchvision.transforms._transforms_video import (
14
+ NormalizeVideo,
15
+ RandomCropVideo,
16
+ RandomHorizontalFlipVideo,
17
+ CenterCropVideo,
18
+ )
19
+ except ImportError:
20
+ from torchvision.transforms import (
21
+ Normalize as NormalizeVideo,
22
+ RandomCrop as RandomCropVideo,
23
+ RandomHorizontalFlip as RandomHorizontalFlipVideo,
24
+ CenterCrop as CenterCropVideo,
25
+ )
26
+ from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
27
+
28
# Ask decord to hand frames back through its torch bridge; the decord branch of
# load_and_transform_video calls ``.permute`` on the decoded batch.
decord.bridge.set_bridge('torch')

# Per-channel mean / std handed to the normalisation step of
# get_video_transform (applied after frames are divided by 255).
OPENAI_DATASET_MEAN = (
    0.48145466,
    0.4578275,
    0.40821073,
)
OPENAI_DATASET_STD = (
    0.26862954,
    0.26130258,
    0.27577711,
)
32
+
33
def make_list_of_images(x):
    """Return the input as a list so callers can always iterate over a batch.

    Args:
        x: A single item (the video processor passes video paths) or a
            ``list`` / ``tuple`` of items.

    Returns:
        ``x`` itself when it is already a list, ``list(x)`` when it is a
        tuple, and the one-element list ``[x]`` for anything else.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        # A tuple is a batch of inputs, not one input; wrapping it as
        # ``[tuple]`` would make the caller treat the tuple as a single item.
        return list(x)
    return [x]
37
+
38
def _normalize_video_clip(clip):
    """Normalise a clip channel-wise with OPENAI_DATASET_MEAN / OPENAI_DATASET_STD.

    The statistics are broadcast as ``(C, 1, 1, 1)``, i.e. the clip is expected
    to be laid out ``(C, T, H, W)`` as produced by load_and_transform_video.
    This is done here instead of through ``NormalizeVideo`` because the
    module's import fallback aliases ``torchvision.transforms.Normalize`` to
    that name, and the image transform broadcasts mean/std over the
    third-from-last axis, which is T (not C) for a 4-D video clip.
    """
    mean = torch.as_tensor(OPENAI_DATASET_MEAN, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(OPENAI_DATASET_STD, dtype=clip.dtype, device=clip.device)
    return (clip - mean[:, None, None, None]) / std[:, None, None, None]


def _build_frame_transforms():
    """Return the per-clip steps shared by every decode backend."""
    return [
        Lambda(lambda x: x / 255.0),  # raw pixel values -> [0, 1]
        Lambda(_normalize_video_clip),
        ShortSideScale(size=224),
        CenterCropVideo(224),
        # NOTE(review): a random flip makes preprocessing non-deterministic;
        # kept because the original pipeline applies it for every backend.
        RandomHorizontalFlipVideo(p=0.5),
    ]


def get_video_transform(config):
    """Build the video preprocessing transform selected by the config.

    Args:
        config: Object exposing ``vision_config.video_decode_backend`` and
            ``vision_config.num_frames``.

    Returns:
        For ``'pytorchvideo'``: an ``ApplyTransformToKey`` that temporally
        subsamples to ``num_frames`` and then applies the shared steps to the
        ``"video"`` entry of its input. For ``'decord'`` and ``'opencv'``: a
        ``Compose`` of the shared steps only (frame sampling for these backends
        happens in load_and_transform_video).

    Raises:
        NameError: If ``video_decode_backend`` is not one of the three
            supported values.
    """
    vision_config = config.vision_config
    backend = vision_config.video_decode_backend

    if backend == 'pytorchvideo':
        return ApplyTransformToKey(
            key="video",
            transform=Compose(
                [UniformTemporalSubsample(vision_config.num_frames)]
                + _build_frame_transforms()
            ),
        )

    if backend in ('decord', 'opencv'):
        # Both backends used identical, duplicated pipelines.
        return Compose(_build_frame_transforms())

    raise NameError('video_decode_backend should specify in (pytorchvideo, decord, opencv)')
82
+
83
+
84
def load_and_transform_video(
        video_path,
        transform,
        video_decode_backend='opencv',
        clip_start_sec=0.0,
        clip_end_sec=None,
        num_frames=8,
):
    """Decode a video file and run ``transform`` on the decoded clip.

    Args:
        video_path: Path of the video to decode.
        transform: Callable applied to the decoded clip; its result is
            returned unchanged.
        video_decode_backend: ``'pytorchvideo'``, ``'decord'`` or ``'opencv'``.
        clip_start_sec: Clip start in seconds. Only used by the
            ``'pytorchvideo'`` backend.
        clip_end_sec: Clip end in seconds; ``None`` means the full duration.
            Only used by the ``'pytorchvideo'`` backend.
        num_frames: Number of evenly spaced frames to sample. Only used by the
            ``'decord'`` and ``'opencv'`` backends.

    Returns:
        Whatever ``transform`` returns. For ``'decord'`` and ``'opencv'`` the
        transform receives a ``(C, T, H, W)`` tensor; for ``'pytorchvideo'`` it
        receives the object returned by ``EncodedVideo.get_clip``.

    Raises:
        NameError: If ``video_decode_backend`` is not supported.
        OSError: If the video cannot be opened, reports no frames, or a
            sampled frame cannot be read (``'decord'`` / ``'opencv'``).
    """
    if video_decode_backend == 'pytorchvideo':
        # decord pyav
        video = EncodedVideo.from_path(video_path, decoder="decord", decode_audio=False)
        end_sec = clip_end_sec if clip_end_sec is not None else video.duration  # secs
        video_data = video.get_clip(start_sec=clip_start_sec, end_sec=end_sec)
        return transform(video_data)

    if video_decode_backend == 'decord':
        decord.bridge.set_bridge('torch')
        decord_vr = VideoReader(video_path, ctx=cpu(0))
        frame_count = len(decord_vr)
        if frame_count <= 0:
            raise OSError(f"Video contains no frames: {video_path}")
        frame_id_list = np.linspace(0, frame_count - 1, num_frames, dtype=int)
        video_data = decord_vr.get_batch(frame_id_list)
        video_data = video_data.permute(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
        return transform(video_data)

    if video_decode_backend == 'opencv':
        cv2_vr = cv2.VideoCapture(video_path)
        try:
            if not cv2_vr.isOpened():
                raise OSError(f"Could not open video: {video_path}")
            frame_count = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                raise OSError(f"Video contains no frames: {video_path}")
            frame_id_list = np.linspace(0, frame_count - 1, num_frames, dtype=int)

            frames = []
            for frame_idx in frame_id_list:
                # Seek to the sampled frame before reading it.
                cv2_vr.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ok, frame = cv2_vr.read()
                if not ok or frame is None:
                    # Fail here with a clear message instead of letting
                    # cvtColor choke on ``None``.
                    raise OSError(f"Could not read frame {int(frame_idx)} from video: {video_path}")
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(torch.from_numpy(frame).permute(2, 0, 1))
        finally:
            # Always free the capture handle, including on decode errors.
            cv2_vr.release()
        video_data = torch.stack(frames, dim=1)  # stack along a new time axis
        return transform(video_data)

    raise NameError('video_decode_backend should specify in (pytorchvideo, decord, opencv)')
127
+
128
class LanguageBindVideoProcessor(ProcessorMixin):
    """Processor that tokenizes text and turns video files into pixel tensors.

    Text goes through ``self.tokenizer``; each video path goes through
    ``load_and_transform_video`` with the transform built by
    ``get_video_transform(config)``.
    """

    # NOTE(review): presumably left empty so ProcessorMixin does not manage any
    # sub-components itself; tokenizer and transform are set in __init__.
    attributes = []
    tokenizer_class = ("LanguageBindVideoTokenizer")

    def __init__(self, config, tokenizer=None, **kwargs):
        """Store the config and tokenizer and build the video transform.

        Args:
            config: Object exposing ``vision_config.video_decode_backend`` and
                ``vision_config.num_frames``.
            tokenizer: Callable tokenizer used for ``text`` inputs and for
                ``decode`` / ``batch_decode``.
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """
        super().__init__(**kwargs)
        self.config = config
        self.transform = get_video_transform(config)
        self.image_processor = load_and_transform_video
        self.tokenizer = tokenizer

    @staticmethod
    def _as_clip_tensor(output):
        """Return the clip tensor from a transform result.

        The ``'pytorchvideo'`` transform is applied to the ``"video"`` key of a
        dict and returns that dict, which ``torch.stack`` cannot consume.
        """
        if isinstance(output, dict):
            return output["video"]
        return output

    @staticmethod
    def _split_legacy_flag(args, skip_special_tokens):
        """Support the old ``decode(True, ids)`` positional-flag call form."""
        if args and isinstance(args[0], bool):
            return args[1:], args[0]
        return args, skip_special_tokens

    def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
        """Tokenize ``text`` and/or load and preprocess the videos in ``images``.

        Args:
            images: A video path or a list of video paths.
            text: Text input passed to the tokenizer.
            context_length: ``max_length`` for the tokenizer; text is padded
                and truncated to this length.
            return_tensors: Forwarded to the tokenizer.
            **kwargs: Extra keyword arguments forwarded to the tokenizer.

        Returns:
            The tokenizer encoding when ``text`` is given (with a
            ``"pixel_values"`` entry added when ``images`` is also given), or
            ``{"pixel_values": tensor}`` when only ``images`` is given.

        Raises:
            ValueError: If both ``text`` and ``images`` are ``None``.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
                                      truncation=True, return_tensors=return_tensors, **kwargs)

        if images is not None:
            vision_config = self.config.vision_config
            clips = [
                self._as_clip_tensor(
                    self.image_processor(
                        image,
                        self.transform,
                        video_decode_backend=vision_config.video_decode_backend,
                        num_frames=vision_config.num_frames,
                    )
                )
                for image in make_list_of_images(images)
            ]
            image_features = torch.stack(clips)

        if text is None:
            return {"pixel_values": image_features}
        if images is not None:
            encoding["pixel_values"] = image_features
        return encoding

    def batch_decode(self, *args, skip_special_tokens=True, **kwargs):
        """Forward all arguments to the wrapped tokenizer's ``batch_decode``.

        ``skip_special_tokens`` is keyword-only so that the first positional
        argument (the sequences) is no longer captured as the flag.
        """
        args, skip_special_tokens = self._split_legacy_flag(args, skip_special_tokens)
        return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)

    def decode(self, *args, skip_special_tokens=True, **kwargs):
        """Forward all arguments to the wrapped tokenizer's ``decode``.

        ``skip_special_tokens`` is keyword-only so that the first positional
        argument (the token ids) is no longer captured as the flag.
        """
        args, skip_special_tokens = self._split_legacy_flag(args, skip_special_tokens)
        return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
@@ -0,0 +1,78 @@
1
+ from transformers import CLIPTokenizer
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
# Checkpoint identifier used as the key of every per-checkpoint table below.
_CHECKPOINT = "lb203/LanguageBind-Video"

# Tokenizer asset kind -> file name.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    merges_file="merges.txt",
)

# Tokenizer asset kind -> {checkpoint: download URL}.
PRETRAINED_VOCAB_FILES_MAP = dict(
    vocab_file={
        _CHECKPOINT: "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/vocab.json",
    },
    merges_file={
        _CHECKPOINT: "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/merges.txt",
    },
)

# Checkpoint -> positional-embedding size (assigned to the tokenizer class as
# ``max_model_input_sizes``).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {_CHECKPOINT: 77}

# Checkpoint -> init configuration; empty for this checkpoint.
PRETRAINED_INIT_CONFIGURATION = {_CHECKPOINT: {}}
28
+
29
class LanguageBindVideoTokenizer(CLIPTokenizer):
    """
    CLIP tokenizer subclass for the LanguageBind video checkpoint.

    The class only overrides the checkpoint tables and the constructor
    defaults; every constructor argument is forwarded to `CLIPTokenizer` by
    keyword. Refer to `CLIPTokenizer` for the tokenization methods themselves.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The padding token; it reuses the end of sequence token as a hack to enable padding.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        # Group the special tokens so the parent call reads as
        # "files, error policy, special tokens, everything else".
        special_tokens = dict(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
        )
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            **special_tokens,
            **kwargs,
        )
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: languagebind
3
+ Version: 0.1.0
4
+ Summary: LanguageBind: multimodal (video/audio/image/depth/thermal) embedding model aligned to text via language-based semantic alignment. Packaged for pip-installability with compatibility patches for modern transformers/torchvision/torchaudio.
5
+ Project-URL: Homepage, https://github.com/PKU-YuanGroup/LanguageBind
6
+ Project-URL: Source, https://github.com/embeddings-benchmark/languagebind
7
+ License: MIT
8
+ Requires-Python: >=3.9
9
+ Requires-Dist: einops>=0.8.0
10
+ Requires-Dist: peft>=0.11.0
11
+ Requires-Dist: torch>=2.0.0
12
+ Requires-Dist: transformers<5.0.0,>=4.40.0
13
+ Provides-Extra: all
14
+ Requires-Dist: decord>=0.6.0; extra == 'all'
15
+ Requires-Dist: opencv-python-headless>=4.5.0; extra == 'all'
16
+ Requires-Dist: pytorchvideo>=0.1.5; extra == 'all'
17
+ Requires-Dist: soundfile>=0.12.0; extra == 'all'
18
+ Requires-Dist: torchaudio>=0.13.0; extra == 'all'
19
+ Requires-Dist: torchvision>=0.15.0; extra == 'all'
20
+ Provides-Extra: audio
21
+ Requires-Dist: soundfile>=0.12.0; extra == 'audio'
22
+ Requires-Dist: torchaudio>=0.13.0; extra == 'audio'
23
+ Provides-Extra: video
24
+ Requires-Dist: decord>=0.6.0; extra == 'video'
25
+ Requires-Dist: opencv-python-headless>=4.5.0; extra == 'video'
26
+ Requires-Dist: pytorchvideo>=0.1.5; extra == 'video'
27
+ Requires-Dist: torchvision>=0.15.0; extra == 'video'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # languagebind
31
+
32
+ [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
33
+
34
+ The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install languagebind
40
+ ```
41
+
42
+ For video support:
43
+ ```bash
44
+ pip install "languagebind[video]"
45
+ ```
46
+
47
+ For audio support:
48
+ ```bash
49
+ pip install "languagebind[audio]"
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ```python
55
+ from languagebind import (
56
+ LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
57
+ LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
58
+ LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
59
+ )
60
+ ```
61
+
62
+ ## Compatibility patches
63
+
64
+ - `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
65
+ - `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
66
+ - `torchvision.transforms._transforms_video`: private API; falls back to the public `torchvision.transforms` equivalents when it cannot be imported
67
+ - `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
68
+
69
+ ## License
70
+
71
+ MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).
@@ -0,0 +1,30 @@
1
+ languagebind/__init__.py,sha256=ileLB3nDuwJiZsYwlk6sA61UlkF7MZNhixF3Obqj2Mk,3481
2
+ languagebind/_compat.py,sha256=ty1dbhW2HLn0Wz7IhVSQjsQOz3v-sgSj3X2QhvCFcCQ,1108
3
+ languagebind/audio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ languagebind/audio/configuration_audio.py,sha256=YAgj-1zKL3a_Egh5cl-QETAnmlkSgOS9yQ4uMYWZtdg,19867
5
+ languagebind/audio/modeling_audio.py,sha256=1o9u3u-qS0VWV2ArlFTJ8ElTLRAuw7F1j6wOwp4ylJQ,44643
6
+ languagebind/audio/processing_audio.py,sha256=5ZVxRm7njmbOxGpulrnjA0-YnRUA6GMKbLdiOLlIfkc,6868
7
+ languagebind/audio/tokenization_audio.py,sha256=Fp-oV3ZLJdzh8hPKj24wWZSq4-x0K2AG0PQ9YItIn_4,2695
8
+ languagebind/depth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ languagebind/depth/configuration_depth.py,sha256=NmS00Q1FnFjcJSqNHXFEb3lY1P-V4Jt-WBNEHcFmCi0,19738
10
+ languagebind/depth/modeling_depth.py,sha256=KmX5ZAaRMpJPk5qbJ8s_vgG2yXNBJWv7hoZ9nVkGqsw,44643
11
+ languagebind/depth/processing_depth.py,sha256=AQkHjBc8R4HbwPVW44_MlTLtqg1i_i2VmJ5fHuRcRNY,4196
12
+ languagebind/depth/tokenization_depth.py,sha256=cqOtIoaHA5SvoJSqIyw2-FRalqDpquL0ArLunXMwaKM,2697
13
+ languagebind/image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ languagebind/image/configuration_image.py,sha256=8wzGzx2uwfE_kDraB6SpQaoBxp2YfXIhhcWE8GQqGN0,19663
15
+ languagebind/image/modeling_image.py,sha256=P34Pk_7iSTzx7_zHD_G9KluJaYVkTTA9ClWOS0DM2-M,44643
16
+ languagebind/image/processing_image.py,sha256=T4MwQas2kTRNCGyXaiSOF8Zg5rR1cU7tGlikCigLjPE,3038
17
+ languagebind/image/tokenization_image.py,sha256=FlEhirf08_zzTM63CAfgLVZwiFPVXaIc8vqWqDy875w,2695
18
+ languagebind/thermal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ languagebind/thermal/configuration_thermal.py,sha256=fXj9-Lg_gCmQCZL4vl37UqJZ05ycqKK_qmOBewOtohw,19667
20
+ languagebind/thermal/modeling_thermal.py,sha256=UDFSa2aHfPbhqs8X2oyhR1EH93vbrKc5VWO3LcgYA_0,44663
21
+ languagebind/thermal/processing_thermal.py,sha256=b3TJpbgienD_3saNiUdpRJTJg_Ssbpj0hj1eaz7ZyRM,3062
22
+ languagebind/thermal/tokenization_thermal.py,sha256=2pFI6qk-ze-n3d4pYHlpYiYt82iZTGSpF7aUo6J1IlM,2711
23
+ languagebind/video/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ languagebind/video/configuration_video.py,sha256=59WWWHckY7t16Tux1VuJFiZh9w7M7J4u4v1r2LCDK6Q,19663
25
+ languagebind/video/modeling_video.py,sha256=Z2TO4Vby7U7aXfOL8HyZWchLCHInm6O9nNFwZUgDyZo,49729
26
+ languagebind/video/processing_video.py,sha256=agBG-944DUVQT0TMCdlgKAgbFP5QmFLa7ZSY0-_s0uM,6879
27
+ languagebind/video/tokenization_video.py,sha256=3bYysMslZhSeGeDUt1n11xnTThC8wADsz6DyXd9ibDU,2695
28
+ languagebind-0.1.0.dist-info/METADATA,sha256=0KJoe8AzqkVKJsw4rXC0O-PFlacbJrhDjd7tivxtS4o,2775
29
+ languagebind-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
30
+ languagebind-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any