PyPI - languagebind - Versions diffs - 0.1.0__py3-none-any.whl - Mend

languagebind 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

languagebind/__init__.py +91 -0
languagebind/_compat.py +24 -0
languagebind/audio/__init__.py +0 -0
languagebind/audio/configuration_audio.py +420 -0
languagebind/audio/modeling_audio.py +1031 -0
languagebind/audio/processing_audio.py +174 -0
languagebind/audio/tokenization_audio.py +78 -0
languagebind/depth/__init__.py +0 -0
languagebind/depth/configuration_depth.py +415 -0
languagebind/depth/modeling_depth.py +1031 -0
languagebind/depth/processing_depth.py +108 -0
languagebind/depth/tokenization_depth.py +78 -0
languagebind/image/__init__.py +0 -0
languagebind/image/configuration_image.py +413 -0
languagebind/image/modeling_image.py +1031 -0
languagebind/image/processing_image.py +77 -0
languagebind/image/tokenization_image.py +78 -0
languagebind/thermal/__init__.py +0 -0
languagebind/thermal/configuration_thermal.py +413 -0
languagebind/thermal/modeling_thermal.py +1031 -0
languagebind/thermal/processing_thermal.py +77 -0
languagebind/thermal/tokenization_thermal.py +78 -0
languagebind/video/__init__.py +0 -0
languagebind/video/configuration_video.py +413 -0
languagebind/video/modeling_video.py +1143 -0
languagebind/video/processing_video.py +174 -0
languagebind/video/tokenization_video.py +78 -0
languagebind-0.1.0.dist-info/METADATA +71 -0
languagebind-0.1.0.dist-info/RECORD +30 -0
languagebind-0.1.0.dist-info/WHEEL +4 -0

languagebind/video/processing_video.py ADDED Viewed

@@ -0,0 +1,174 @@
+import cv2
+import decord
+import numpy as np
+import torch
+from PIL import Image
+from decord import VideoReader, cpu
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from pytorchvideo.data.encoded_video import EncodedVideo
+from torchvision.transforms import Compose, Lambda, ToTensor
+try:
+    from torchvision.transforms._transforms_video import (
+        NormalizeVideo,
+        RandomCropVideo,
+        RandomHorizontalFlipVideo,
+        CenterCropVideo,
+    )
+except ImportError:
+    from torchvision.transforms import (
+        Normalize as NormalizeVideo,
+        RandomCrop as RandomCropVideo,
+        RandomHorizontalFlip as RandomHorizontalFlipVideo,
+        CenterCrop as CenterCropVideo,
+    )
+from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
+# Ask decord to return torch tensors from its readers; the decord branch of
+# load_and_transform_video calls .permute() directly on the decoded batch.
+decord.bridge.set_bridge('torch')
+# Per-channel mean/std passed to NormalizeVideo in get_video_transform, applied after
+# frames are divided by 255. Presumably the OpenAI CLIP statistics in RGB order — TODO confirm.
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+def make_list_of_images(x):
+    if not isinstance(x, list):
+        return [x]
+    return x
+def get_video_transform(config):
+    config = config.vision_config
+    if config.video_decode_backend == 'pytorchvideo':
+        transform = ApplyTransformToKey(
+            key="video",
+            transform=Compose(
+                [
+                    UniformTemporalSubsample(config.num_frames),
+                    Lambda(lambda x: x / 255.0),
+                    NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+                    ShortSideScale(size=224),
+                    CenterCropVideo(224),
+                    RandomHorizontalFlipVideo(p=0.5),
+                ]
+            ),
+        )
+    elif config.video_decode_backend == 'decord':
+        transform = Compose(
+            [
+                # UniformTemporalSubsample(num_frames),
+                Lambda(lambda x: x / 255.0),
+                NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+                ShortSideScale(size=224),
+                CenterCropVideo(224),
+                RandomHorizontalFlipVideo(p=0.5),
+            ]
+        )
+    elif config.video_decode_backend == 'opencv':
+        transform = Compose(
+            [
+                # UniformTemporalSubsample(num_frames),
+                Lambda(lambda x: x / 255.0),
+                NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+                ShortSideScale(size=224),
+                CenterCropVideo(224),
+                RandomHorizontalFlipVideo(p=0.5),
+            ]
+        )
+    else:
+        raise NameError('video_decode_backend should specify in (pytorchvideo, decord, opencv)')
+    return transform
+def load_and_transform_video(
+        video_path,
+        transform,
+        video_decode_backend='opencv',
+        clip_start_sec=0.0,
+        clip_end_sec=None,
+        num_frames=8,
+):
+    if video_decode_backend == 'pytorchvideo':
+        #  decord pyav
+        video = EncodedVideo.from_path(video_path, decoder="decord", decode_audio=False)
+        duration = video.duration
+        start_sec = clip_start_sec  # secs
+        end_sec = clip_end_sec if clip_end_sec is not None else duration  # secs
+        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+        video_outputs = transform(video_data)
+    elif video_decode_backend == 'decord':
+        decord.bridge.set_bridge('torch')
+        decord_vr = VideoReader(video_path, ctx=cpu(0))
+        duration = len(decord_vr)
+        frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
+        video_data = decord_vr.get_batch(frame_id_list)
+        video_data = video_data.permute(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
+        video_outputs = transform(video_data)
+    elif video_decode_backend == 'opencv':
+        cv2_vr = cv2.VideoCapture(video_path)
+        duration = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
+        frame_id_list = np.linspace(0, duration-1, num_frames, dtype=int)
+        video_data = []
+        for frame_idx in frame_id_list:
+            cv2_vr.set(1, frame_idx)
+            _, frame = cv2_vr.read()
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            video_data.append(torch.from_numpy(frame).permute(2, 0, 1))
+        cv2_vr.release()
+        video_data = torch.stack(video_data, dim=1)
+        video_outputs = transform(video_data)
+    else:
+        raise NameError('video_decode_backend should specify in (pytorchvideo, decord, opencv)')
+    return video_outputs
+class LanguageBindVideoProcessor(ProcessorMixin):
+    attributes = []
+    tokenizer_class = ("LanguageBindVideoTokenizer")
+    def __init__(self, config, tokenizer=None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.transform = get_video_transform(config)
+        self.image_processor = load_and_transform_video
+        self.tokenizer = tokenizer
+    def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be none.")
+        if text is not None:
+            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+                                      truncation=True, return_tensors=return_tensors, **kwargs)
+        if images is not None:
+            images = make_list_of_images(images)
+            image_features = [self.image_processor(image, self.transform,
+                                                   video_decode_backend=self.config.vision_config.video_decode_backend,
+                                                   num_frames=self.config.vision_config.num_frames) for image in images]
+            image_features = torch.stack(image_features)
+        if text is not None and images is not None:
+            encoding["pixel_values"] = image_features
+            return encoding
+        elif text is not None:
+            return encoding
+        else:
+            return {"pixel_values": image_features}
+    def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+        """
+        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+    def decode(self, skip_special_tokens=True, *args, **kwargs):
+        """
+        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)

languagebind/video/tokenization_video.py ADDED Viewed

@@ -0,0 +1,78 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+# Module-level logger obtained through the transformers logging helpers.
+logger = logging.get_logger(__name__)
+# File names of the two tokenizer resources: the vocabulary and the merges list.
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+# Download URL of each resource, keyed by resource kind and then by checkpoint name.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/merges.txt",
+    },
+}
+# Per-checkpoint size exposed on the tokenizer class as max_model_input_sizes.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "lb203/LanguageBind-Video": 77,
+}
+# Per-checkpoint init overrides; empty for this checkpoint and not referenced by the
+# tokenizer class defined in this module.
+PRETRAINED_INIT_CONFIGURATION = {
+    "lb203/LanguageBind-Video": {},
+}
+class LanguageBindVideoTokenizer(CLIPTokenizer):
+    """
+    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The end of sequence token.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+            self,
+            vocab_file,
+            merges_file,
+            errors="replace",
+            unk_token="<|endoftext|>",
+            bos_token="<|startoftext|>",
+            eos_token="<|endoftext|>",
+            pad_token="<|endoftext|>",  # hack to enable padding
+            **kwargs,
+    ):
+        super(LanguageBindVideoTokenizer, self).__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )

languagebind-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,71 @@
+Metadata-Version: 2.4
+Name: languagebind
+Version: 0.1.0
+Summary: LanguageBind: multimodal (video/audio/image/depth/thermal) embedding model aligned to text via language-based semantic alignment. Packaged for pip-installability with compatibility patches for modern transformers/torchvision/torchaudio.
+Project-URL: Homepage, https://github.com/PKU-YuanGroup/LanguageBind
+Project-URL: Source, https://github.com/embeddings-benchmark/languagebind
+License: MIT
+Requires-Python: >=3.9
+Requires-Dist: einops>=0.8.0
+Requires-Dist: peft>=0.11.0
+Requires-Dist: torch>=2.0.0
+Requires-Dist: transformers<5.0.0,>=4.40.0
+Provides-Extra: all
+Requires-Dist: decord>=0.6.0; extra == 'all'
+Requires-Dist: opencv-python-headless>=4.5.0; extra == 'all'
+Requires-Dist: pytorchvideo>=0.1.5; extra == 'all'
+Requires-Dist: soundfile>=0.12.0; extra == 'all'
+Requires-Dist: torchaudio>=0.13.0; extra == 'all'
+Requires-Dist: torchvision>=0.15.0; extra == 'all'
+Provides-Extra: audio
+Requires-Dist: soundfile>=0.12.0; extra == 'audio'
+Requires-Dist: torchaudio>=0.13.0; extra == 'audio'
+Provides-Extra: video
+Requires-Dist: decord>=0.6.0; extra == 'video'
+Requires-Dist: opencv-python-headless>=4.5.0; extra == 'video'
+Requires-Dist: pytorchvideo>=0.1.5; extra == 'video'
+Requires-Dist: torchvision>=0.15.0; extra == 'video'
+Description-Content-Type: text/markdown
+# languagebind
+[LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind) (ICLR 2024) packaged as a pip-installable library with compatibility patches for modern `transformers`, `torchvision`, and `torchaudio`.
+The original LanguageBind repo has no `pyproject.toml`, so it cannot be installed via pip. This package provides that, plus inline patches for five breaking changes introduced in newer dependency versions.
+## Installation
+```bash
+pip install languagebind
+```
+For video support:
+```bash
+pip install "languagebind[video]"
+```
+For audio support:
+```bash
+pip install "languagebind[audio]"
+```
+## Usage
+```python
+from languagebind import (
+    LanguageBindVideo, LanguageBindVideoProcessor, LanguageBindVideoTokenizer,
+    LanguageBindAudio, LanguageBindAudioProcessor, LanguageBindAudioTokenizer,
+    LanguageBindImage, LanguageBindImageProcessor, LanguageBindImageTokenizer,
+)
+```
+## Compatibility patches
+- `_expand_mask` / `clip_loss`: removed from `transformers` 4.40+, re-implemented in `languagebind._compat`
+- `torchaudio.set_audio_backend()`: deprecated, guarded with `try/except`
+- `torchvision.transforms._transforms_video`: private API; when it cannot be imported, the public `torchvision.transforms` equivalents are used instead
+- `CLIPTokenizer.__init__` positional args: changed to keyword args for `transformers` 4.40+ compatibility
+## License
+MIT — same as the original [LanguageBind](https://github.com/PKU-YuanGroup/LanguageBind).

languagebind-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,30 @@
+languagebind/__init__.py,sha256=ileLB3nDuwJiZsYwlk6sA61UlkF7MZNhixF3Obqj2Mk,3481
+languagebind/_compat.py,sha256=ty1dbhW2HLn0Wz7IhVSQjsQOz3v-sgSj3X2QhvCFcCQ,1108
+languagebind/audio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+languagebind/audio/configuration_audio.py,sha256=YAgj-1zKL3a_Egh5cl-QETAnmlkSgOS9yQ4uMYWZtdg,19867
+languagebind/audio/modeling_audio.py,sha256=1o9u3u-qS0VWV2ArlFTJ8ElTLRAuw7F1j6wOwp4ylJQ,44643
+languagebind/audio/processing_audio.py,sha256=5ZVxRm7njmbOxGpulrnjA0-YnRUA6GMKbLdiOLlIfkc,6868
+languagebind/audio/tokenization_audio.py,sha256=Fp-oV3ZLJdzh8hPKj24wWZSq4-x0K2AG0PQ9YItIn_4,2695
+languagebind/depth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+languagebind/depth/configuration_depth.py,sha256=NmS00Q1FnFjcJSqNHXFEb3lY1P-V4Jt-WBNEHcFmCi0,19738
+languagebind/depth/modeling_depth.py,sha256=KmX5ZAaRMpJPk5qbJ8s_vgG2yXNBJWv7hoZ9nVkGqsw,44643
+languagebind/depth/processing_depth.py,sha256=AQkHjBc8R4HbwPVW44_MlTLtqg1i_i2VmJ5fHuRcRNY,4196
+languagebind/depth/tokenization_depth.py,sha256=cqOtIoaHA5SvoJSqIyw2-FRalqDpquL0ArLunXMwaKM,2697
+languagebind/image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+languagebind/image/configuration_image.py,sha256=8wzGzx2uwfE_kDraB6SpQaoBxp2YfXIhhcWE8GQqGN0,19663
+languagebind/image/modeling_image.py,sha256=P34Pk_7iSTzx7_zHD_G9KluJaYVkTTA9ClWOS0DM2-M,44643
+languagebind/image/processing_image.py,sha256=T4MwQas2kTRNCGyXaiSOF8Zg5rR1cU7tGlikCigLjPE,3038
+languagebind/image/tokenization_image.py,sha256=FlEhirf08_zzTM63CAfgLVZwiFPVXaIc8vqWqDy875w,2695
+languagebind/thermal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+languagebind/thermal/configuration_thermal.py,sha256=fXj9-Lg_gCmQCZL4vl37UqJZ05ycqKK_qmOBewOtohw,19667
+languagebind/thermal/modeling_thermal.py,sha256=UDFSa2aHfPbhqs8X2oyhR1EH93vbrKc5VWO3LcgYA_0,44663
+languagebind/thermal/processing_thermal.py,sha256=b3TJpbgienD_3saNiUdpRJTJg_Ssbpj0hj1eaz7ZyRM,3062
+languagebind/thermal/tokenization_thermal.py,sha256=2pFI6qk-ze-n3d4pYHlpYiYt82iZTGSpF7aUo6J1IlM,2711
+languagebind/video/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+languagebind/video/configuration_video.py,sha256=59WWWHckY7t16Tux1VuJFiZh9w7M7J4u4v1r2LCDK6Q,19663
+languagebind/video/modeling_video.py,sha256=Z2TO4Vby7U7aXfOL8HyZWchLCHInm6O9nNFwZUgDyZo,49729
+languagebind/video/processing_video.py,sha256=agBG-944DUVQT0TMCdlgKAgbFP5QmFLa7ZSY0-_s0uM,6879
+languagebind/video/tokenization_video.py,sha256=3bYysMslZhSeGeDUt1n11xnTThC8wADsz6DyXd9ibDU,2695
+languagebind-0.1.0.dist-info/METADATA,sha256=0KJoe8AzqkVKJsw4rXC0O-PFlacbJrhDjd7tivxtS4o,2775
+languagebind-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+languagebind-0.1.0.dist-info/RECORD,,

languagebind-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any