PyPI - birder-clip - Versions diffs - 0.0.2.dev5__tar.gz → 0.0.2.dev7__tar.gz - Mend

birder-clip 0.0.2.dev5__tar.gz → 0.0.2.dev7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

birder_clip-0.0.2.dev7/PKG-INFO ADDED Viewed

@@ -0,0 +1,151 @@
+Metadata-Version: 2.4
+Name: birder_clip
+Version: 0.0.2.dev7
+Summary: A Birder extension for CLIP-style image-text modeling and multimodal computer vision workflows.
+Author: Ofer Hasson
+License-Expression: Apache-2.0
+Project-URL: Homepage, https://gitlab.com/birder/birder-clip
+Project-URL: Issues, https://gitlab.com/birder/birder-clip/-/issues
+Project-URL: Changelog, https://gitlab.com/birder/birder-clip/-/blob/main/CHANGELOG.md
+Keywords: computer-vision,clip,image-text,pytorch,deep-learning,artificial intelligence
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Typing :: Typed
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: birder>=0.6.0
+Requires-Dist: ftfy>=6.3.1
+Requires-Dist: regex>=2025.7.29
+Requires-Dist: tqdm>=4.67.0
+Requires-Dist: webdataset>=0.2.111
+Requires-Dist: huggingface_hub
+Requires-Dist: transformers
+Requires-Dist: torch>=2.10.0
+Requires-Dist: torchvision
+Provides-Extra: dev
+Requires-Dist: bandit~=1.9.4; extra == "dev"
+Requires-Dist: black~=26.5.0; extra == "dev"
+Requires-Dist: build~=1.5.0; extra == "dev"
+Requires-Dist: bumpver~=2026.1132; extra == "dev"
+Requires-Dist: coverage~=7.14.2; extra == "dev"
+Requires-Dist: debugpy; extra == "dev"
+Requires-Dist: flake8-pep585~=0.1.7; extra == "dev"
+Requires-Dist: flake8~=7.3.0; extra == "dev"
+Requires-Dist: invoke~=3.0.3; extra == "dev"
+Requires-Dist: ipython; extra == "dev"
+Requires-Dist: isort~=8.0.1; extra == "dev"
+Requires-Dist: mkdocs~=1.6.1; extra == "dev"
+Requires-Dist: mkdocs-exclude~=1.0.2; extra == "dev"
+Requires-Dist: mypy~=2.1.0; extra == "dev"
+Requires-Dist: parameterized~=0.9.0; extra == "dev"
+Requires-Dist: pylint~=4.0.6; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: requests~=2.34.2; extra == "dev"
+Requires-Dist: safetensors~=0.7.0; extra == "dev"
+Requires-Dist: setuptools; extra == "dev"
+Requires-Dist: twine~=6.2.0; extra == "dev"
+Requires-Dist: types-requests~=2.33.0; extra == "dev"
+Requires-Dist: urllib3~=2.7.0; extra == "dev"
+Requires-Dist: wheel; extra == "dev"
+Dynamic: license-file
+# Birder CLIP
+Birder CLIP is an early-stage Birder extension for CLIP-style image-text models, focused on practical inference and fine-tuning workflows.
+- [Introduction](#introduction)
+- [Setup](#setup)
+- [Getting Started](#getting-started)
+- [Training](#training)
+- [Project Status and Contributions](#project-status-and-contributions)
+- [Licenses](#licenses)
+- [Acknowledgments](#acknowledgments)
+## Introduction
+Birder CLIP extends [Birder](https://gitlab.com/birder/birder) with image-text models for zero-shot classification, image-text retrieval-style workflows, caption generation and related multimodal computer vision tasks.
+The project is aimed at image-text modeling rather than general vision-language model (VLM) chat or instruction-following systems.
+It currently includes CLIP-style components, tokenizers, model registry utilities, inference scripts and training code.
+Full training is supported, but for large-scale CLIP pretraining you are probably better served by [OpenCLIP](https://github.com/mlfoundations/open_clip).
+## Setup
+1. Ensure your environment meets the minimum requirements:
+   - Python 3.11 or newer
+   - PyTorch 2.10 or newer (installed for your hardware/driver stack)
+   - Birder 0.6.0 or newer
+1. Install the latest Birder CLIP version:
+```sh
+pip install birder-clip
+```
+## Getting Started
+List available image-text models:
+```sh
+python -m birder_clip.tools list-models --image-text
+```
+List available pretrained weights:
+```sh
+python -m birder_clip.tools list-models --pretrained --verbose
+```
+Run zero-shot classification on a directory of images:
+```sh
+python -m birder_clip.scripts.zero_shot -n laion_clip_vit_l14 --classes eagle hawk falcon --template-set default --gpu data/images
+```
+For detailed options, run:
+```sh
+python -m birder_clip.scripts.zero_shot --help
+python -m birder_clip.tools --help
+```
+## Training
+Birder CLIP includes training support for image-text datasets in CSV and WebDataset formats, including CLIP, CoCa and LiT-style workflows.
+## Project Status and Contributions
+Birder CLIP is an early alpha project. APIs, model names, checkpoints, training recipes and command-line options may change without notice.
+This is currently a personal project in active development. Suggestions, bug reports and feedback are welcome through the project's issue tracker, but the project is not yet stable enough for broad external contributions.
+## Licenses
+The code in this project is primarily licensed under Apache 2.0. See [LICENSE](LICENSE) for details.
+Some model implementations, pretrained weights, tokenizers and converted artifacts may be derived from or depend on projects and datasets with their own licenses and usage restrictions.
+**You are responsible for ensuring compliance with all licenses and conditions of any dependent licenses.**
+### Disclaimer
+If you intend to use Birder CLIP, its pretrained weights, or any associated datasets in a commercial product, we strongly recommend seeking legal advice to ensure compliance with all relevant licenses and terms of use.
+It's the user's responsibility to ensure that their use of this project, including any pretrained weights or datasets, complies with all applicable licenses and legal requirements.
+## Acknowledgments
+Birder CLIP owes much to the work of others in computer vision, image-text representation learning and open-source machine learning.
+Special thanks to the [OpenCLIP](https://github.com/mlfoundations/open_clip) project, which serves as the main reference implementation and inspiration for much of the CLIP-style modeling and training work here. The same principle as in Birder applies: this project stands on the shoulders of many open-source projects, papers and datasets. If an attribution is missing, please open an issue.

birder_clip-0.0.2.dev7/README.md ADDED Viewed

@@ -0,0 +1,89 @@
+# Birder CLIP
+Birder CLIP is an early-stage Birder extension for CLIP-style image-text models, focused on practical inference and fine-tuning workflows.
+- [Introduction](#introduction)
+- [Setup](#setup)
+- [Getting Started](#getting-started)
+- [Training](#training)
+- [Project Status and Contributions](#project-status-and-contributions)
+- [Licenses](#licenses)
+- [Acknowledgments](#acknowledgments)
+## Introduction
+Birder CLIP extends [Birder](https://gitlab.com/birder/birder) with image-text models for zero-shot classification, image-text retrieval-style workflows, caption generation and related multimodal computer vision tasks.
+The project is aimed at image-text modeling rather than general vision-language model (VLM) chat or instruction-following systems.
+It currently includes CLIP-style components, tokenizers, model registry utilities, inference scripts and training code.
+Full training is supported, but for large-scale CLIP pretraining you are probably better served by [OpenCLIP](https://github.com/mlfoundations/open_clip).
+## Setup
+1. Ensure your environment meets the minimum requirements:
+   - Python 3.11 or newer
+   - PyTorch 2.10 or newer (installed for your hardware/driver stack)
+   - Birder 0.6.0 or newer
+1. Install the latest Birder CLIP version:
+```sh
+pip install birder-clip
+```
+## Getting Started
+List available image-text models:
+```sh
+python -m birder_clip.tools list-models --image-text
+```
+List available pretrained weights:
+```sh
+python -m birder_clip.tools list-models --pretrained --verbose
+```
+Run zero-shot classification on a directory of images:
+```sh
+python -m birder_clip.scripts.zero_shot -n laion_clip_vit_l14 --classes eagle hawk falcon --template-set default --gpu data/images
+```
+For detailed options, run:
+```sh
+python -m birder_clip.scripts.zero_shot --help
+python -m birder_clip.tools --help
+```
+## Training
+Birder CLIP includes training support for image-text datasets in CSV and WebDataset formats, including CLIP, CoCa and LiT-style workflows.
+## Project Status and Contributions
+Birder CLIP is an early alpha project. APIs, model names, checkpoints, training recipes and command-line options may change without notice.
+This is currently a personal project in active development. Suggestions, bug reports and feedback are welcome through the project's issue tracker, but the project is not yet stable enough for broad external contributions.
+## Licenses
+The code in this project is primarily licensed under Apache 2.0. See [LICENSE](LICENSE) for details.
+Some model implementations, pretrained weights, tokenizers and converted artifacts may be derived from or depend on projects and datasets with their own licenses and usage restrictions.
+**You are responsible for ensuring compliance with all licenses and conditions of any dependent licenses.**
+### Disclaimer
+If you intend to use Birder CLIP, its pretrained weights, or any associated datasets in a commercial product, we strongly recommend seeking legal advice to ensure compliance with all relevant licenses and terms of use.
+It's the user's responsibility to ensure that their use of this project, including any pretrained weights or datasets, complies with all applicable licenses and legal requirements.
+## Acknowledgments
+Birder CLIP owes much to the work of others in computer vision, image-text representation learning and open-source machine learning.
+Special thanks to the [OpenCLIP](https://github.com/mlfoundations/open_clip) project, which serves as the main reference implementation and inspiration for much of the CLIP-style modeling and training work here. The same principle as in Birder applies: this project stands on the shoulders of many open-source projects, papers and datasets. If an attribution is missing, please open an issue.

{birder_clip-0.0.2.dev5 → birder_clip-0.0.2.dev7}/birder_clip/common/lib.py RENAMED Viewed

@@ -43,7 +43,7 @@ def get_image_text_network_name(
     parts = [network]
     if image_encoder is not None:
         parts.append(image_encoder)
-    if text_encoder is not None and text_encoder != "text_transformer":
+    if text_encoder is not None and text_encoder != "transformer_encoder":
         parts.append(text_encoder)
     if registry.exists(network) is True:

{birder_clip-0.0.2.dev5 → birder_clip-0.0.2.dev7}/birder_clip/common/training_cli.py RENAMED Viewed

@@ -21,6 +21,11 @@ def add_model_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument("-n", "--network", type=str, help="the image-text network to train")
     parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument("--image-encoder", type=str, help="the image encoder to use")
+    parser.add_argument(
+        "--image-encoder-pretrained",
+        type=str,
+        help="pretrained Birder image model weights path to load into the image encoder",
+    )
     parser.add_argument("--text-encoder", type=str, help="the text encoder to use")
     parser.add_argument("--embed-dim", type=int, metavar="N", help="shared image-text embedding dimension")
     parser.add_argument("--tokenizer", type=str, help="the tokenizer to use")
@@ -43,7 +48,23 @@ def add_model_args(parser: argparse.ArgumentParser) -> None:
 def add_loss_args(parser: argparse.ArgumentParser) -> None:
     group = parser.add_argument_group("Loss parameters")
-    group.add_argument("--loss", type=str, choices=["clip"], default="clip", help="loss function to use")
+    group.add_argument("--loss", type=str, choices=["clip", "coca"], default="clip", help="loss function to use")
+    group.add_argument(
+        "--coca-caption-loss-weight", type=float, default=1.0, help="weight assigned to CoCa caption loss"
+    )
+    group.add_argument(
+        "--coca-contrastive-loss-weight", type=float, default=1.0, help="weight assigned to CoCa contrastive loss"
+    )
+def add_freeze_args(parser: argparse.ArgumentParser) -> None:
+    group = parser.add_argument_group("Freeze parameters")
+    group.add_argument(
+        "--freeze-image-encoder",
+        default=False,
+        action="store_true",
+        help="freeze image encoder body, leaving the projection head trainable",
+    )
 def add_optimization_args(parser: argparse.ArgumentParser, default_batch_size: int = 32) -> None:
@@ -66,7 +87,12 @@ def add_optimization_args(parser: argparse.ArgumentParser, default_batch_size: i
         metavar="N",
         help="number of iterations to accumulate gradients per optimizer step",
     )
-    # NOTE: Add flag for negative sample caching in grad accum mode
+    group.add_argument(
+        "--grad-accum-cache-negatives",
+        default=False,
+        action="store_true",
+        help="cache features so CLIP loss uses all accumulated microbatches as negatives",
+    )
 def add_lr_wd_args(parser: argparse.ArgumentParser) -> None:
@@ -250,6 +276,8 @@ def add_data_aug_args(
     )
     group.add_argument("--ra-magnitude", type=int, default=9, help="magnitude for all the RandAugment transformations")
     group.add_argument("--augmix-severity", type=int, default=3, help="severity of AugMix policy")
+    group.add_argument("--clip-color-jitter-prob", type=float, default=0.0, help="CLIP color jitter probability")
+    group.add_argument("--clip-gray-prob", type=float, default=0.0, help="CLIP grayscale probability")
     group.add_argument("--resize-min-scale", type=float, default=default_min_scale, help="random resize min scale")
     group.add_argument(
         "--re-prob",
@@ -356,6 +384,29 @@ def add_precision_args(parser: argparse.ArgumentParser) -> None:
     )
+def add_grad_checkpointing_args(parser: argparse.ArgumentParser) -> None:
+    group = parser.add_argument_group("Gradient checkpointing parameters")
+    group.add_argument(
+        "--grad-checkpointing",
+        default=False,
+        action="store_true",
+        help="enable gradient checkpointing for supported models",
+    )
+    group.add_argument(
+        "--grad-checkpointing-segments",
+        type=int,
+        metavar="N",
+        help="number of checkpoint segments to request from supported models",
+    )
+    group.add_argument(
+        "--no-grad-checkpointing-preserve-rng-state",
+        dest="grad_checkpointing_preserve_rng_state",
+        default=True,
+        action="store_false",
+        help="disable RNG state preservation during gradient checkpointing recomputation",
+    )
 def add_compile_args(parser: argparse.ArgumentParser) -> None:
     group = parser.add_argument_group("Compilation parameters")
     group.add_argument("--compile", default=False, action="store_true", help="enable compilation")
@@ -578,8 +629,12 @@ def common_args_validation(args: argparse.Namespace) -> None:
         raise cli.ValidationError("--load-states requires --resume-epoch to be set")
     if args.load_scheduler is True and args.resume_epoch is None:
         raise cli.ValidationError("--load-scheduler requires --resume-epoch to be set")
-    if hasattr(args, "pretrained") is True and args.pretrained is True and args.resume_epoch is not None:
+    if args.pretrained is True and args.resume_epoch is not None:
         raise cli.ValidationError("--pretrained cannot be used with --resume-epoch")
+    if args.image_encoder_pretrained is not None and args.resume_epoch is not None:
+        raise cli.ValidationError("--image-encoder-pretrained cannot be used with --resume-epoch")
+    if args.pretrained is True and args.image_encoder_pretrained is not None:
+        raise cli.ValidationError("--image-encoder-pretrained cannot be used with --pretrained")
     if args.freeze_bn is True and args.sync_bn is True:
         raise cli.ValidationError("--freeze-bn cannot be used with --sync-bn")
@@ -605,10 +660,29 @@ def common_args_validation(args: argparse.Namespace) -> None:
         raise cli.ValidationError("--context-length must be positive")
     if args.grad_accum_steps < 1:
         raise cli.ValidationError("--grad-accum-steps must be >= 1")
+    if args.grad_accum_cache_negatives is True and args.grad_accum_steps == 1:
+        raise cli.ValidationError("--grad-accum-cache-negatives requires --grad-accum-steps greater than 1")
+    if args.grad_accum_cache_negatives is True and args.loss == "coca":
+        raise cli.ValidationError("--grad-accum-cache-negatives is only supported with --loss clip")
+    if args.coca_caption_loss_weight < 0.0:
+        raise cli.ValidationError("--coca-caption-loss-weight must be non-negative")
+    if args.coca_contrastive_loss_weight < 0.0:
+        raise cli.ValidationError("--coca-contrastive-loss-weight must be non-negative")
+    # EMA
     if args.model_ema_steps < 1:
         raise cli.ValidationError("--model-ema-steps must be >= 1")
+    # Gradient checkpointing args
+    if args.grad_checkpointing_segments is not None and args.grad_checkpointing_segments < 1:
+        raise cli.ValidationError("--grad-checkpointing-segments must be >= 1")
+    if args.grad_checkpointing_segments is not None and args.grad_checkpointing is False:
+        raise cli.ValidationError("--grad-checkpointing-segments requires --grad-checkpointing")
     if args.distributed_mode == "fsdp":
+        if args.grad_checkpointing is True:
+            raise cli.ValidationError("--grad-checkpointing cannot be used with --distributed-mode fsdp")
         if args.sync_bn is True:
             raise cli.ValidationError("--sync-bn cannot be used with --distributed-mode fsdp")
         if args.find_unused_parameters is True:

birder_clip-0.0.2.dev7/birder_clip/inference/data_parallel.py ADDED Viewed

@@ -0,0 +1,118 @@
+"""
+Inference-optimized multi-GPU parallelization for image-text models
+This module provides ZeroShotInferenceDataParallel, a CLIP-style zero-shot
+specialization of Birder's InferenceDataParallel.
+"""
+from typing import Optional
+import torch
+from birder.inference.data_parallel import InferenceDataParallel
+from birder_clip.inference.zero_shot import ZeroShotInference
+from birder_clip.net.base import BaseNet
+class ZeroShotInferenceDataParallel(InferenceDataParallel):
+    """
+    Distributes zero-shot image inference batches across multiple GPUs
+    This wrapper scatters the image batch across devices and keeps a replicated
+    copy of the zero-shot text embeddings on each device. Each replica computes
+    image embeddings and zero-shot logits locally before outputs are gathered.
+    Important
+    ---------
+    This class assumes the model is already configured for inference mode
+    (i.e., loaded with inference=True in load_model or manually set to eval mode
+    with requires_grad=False on all parameters).
+    """
+    def __init__(
+        self,
+        module: BaseNet,
+        text_embeddings: torch.Tensor,
+        device_ids: Optional[list[int]] = None,
+        output_device: Optional[int | str | torch.device] = None,
+        compile_replicas: bool = False,
+        compile_methods: Optional[list[str]] = None,
+        compile_mode: Optional[str] = None,
+    ) -> None:
+        if compile_methods is None:
+            compile_methods = ["encode_image", "forward_logits"]
+        super().__init__(
+            module,
+            device_ids=device_ids,
+            output_device=output_device,
+            compile_replicas=compile_replicas,
+            compile_methods=compile_methods,
+            compile_mode=compile_mode,
+        )
+        self.set_text_embeddings(text_embeddings)
+    def set_text_embeddings(self, text_embeddings: torch.Tensor) -> None:
+        if text_embeddings.ndim != 2:
+            raise ValueError(
+                f"text_embeddings must be a 2D tensor of shape (num_classes, embedding_size), "
+                f"got shape {text_embeddings.size()}"
+            )
+        self.text_embeddings = [
+            text_embeddings.to(f"cuda:{device_id}", non_blocking=True) for device_id in self.device_ids
+        ]
+        self.inference_modules = [
+            ZeroShotInference(replica, embeddings) for replica, embeddings in zip(self.replicas, self.text_embeddings)
+        ]
+    def forward(  # type: ignore[override] # pylint: disable=arguments-differ
+        self, inputs: torch.Tensor, *, tta: bool = False, return_logits: bool = False
+    ) -> torch.Tensor:
+        """
+        Run zero-shot inference distributed across GPUs
+        Parameters
+        ----------
+        inputs
+            Input image batch to process.
+        tta
+            Run inference with oversampling.
+        return_logits
+            If True, return raw logits instead of probabilities after softmax.
+        """
+        if len(self.device_ids) == 1:
+            output = self.inference_modules[0](
+                inputs,
+                tta=tta,
+                return_logits=return_logits,
+            )
+            return self._gather([output])
+        scattered = self._scatter(inputs, {})
+        outputs = []
+        for inference, (input_chunk, _), device_id in zip(self.inference_modules, scattered, self.device_ids):
+            if input_chunk is not None and input_chunk.size(0) > 0:
+                with torch.cuda.device(device_id):
+                    output = inference(
+                        input_chunk,
+                        tta=tta,
+                        return_logits=return_logits,
+                    )
+                    outputs.append(output)
+            else:
+                outputs.append(None)
+        return self._gather(outputs)
+    def __repr__(self) -> str:
+        return (
+            f"ZeroShotInferenceDataParallel(\n"
+            f"  devices={self.device_ids},\n"
+            f"  output_device={self.output_device},\n"
+            f"  src_device={self.src_device},\n"
+            f"  text_embeddings_shape={tuple(self.text_embeddings[0].shape)}\n"
+            f")"
+        )

birder_clip-0.0.2.dev7/birder_clip/inference/image_embeddings.py ADDED Viewed

@@ -0,0 +1,63 @@
+import sys
+from collections.abc import Iterator
+from typing import Optional
+import numpy as np
+import numpy.typing as npt
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from birder_clip.net.base import BaseNet
+DataloaderInferenceResult = tuple[list[str], npt.NDArray[np.float32]]
+def infer_dataloader_iter(
+    device: torch.device,
+    net: BaseNet,
+    dataloader: DataLoader,
+    model_dtype: torch.dtype = torch.float32,
+    amp: bool = False,
+    amp_dtype: Optional[torch.dtype] = None,
+    num_samples: Optional[int] = None,
+    chunk_size: Optional[float] = None,
+) -> Iterator[DataloaderInferenceResult]:
+    if chunk_size is None:
+        chunk_size = float("inf")
+    net.to(device, dtype=model_dtype)
+    embeddings_list: list[npt.NDArray[np.float32]] = []
+    sample_paths: list[str] = []
+    sample_count = 0
+    with tqdm(total=num_samples, initial=0, unit="images", unit_scale=True, leave=False) as progress:
+        for file_paths, inputs, _targets in dataloader:
+            batch_size = inputs.size(0)
+            # Inference
+            inputs = inputs.to(device, dtype=model_dtype)
+            with torch.amp.autocast(device.type, enabled=amp, dtype=amp_dtype):
+                embeddings = net.encode_image(inputs, normalize=True)
+                embeddings = embeddings.cpu().float().numpy()
+            embeddings_list.append(embeddings)
+            # Set sample list
+            sample_paths.extend(file_paths)
+            # Update progress bar
+            progress.update(n=batch_size)
+            # Yield results when we reach chunk_size
+            sample_count += batch_size
+            if sample_count >= chunk_size:
+                with tqdm.external_write_mode(file=sys.stderr):
+                    yield (sample_paths, np.concatenate(embeddings_list, axis=0))
+                # Reset for next chunk
+                embeddings_list = []
+                sample_paths = []
+                sample_count = 0
+    if len(embeddings_list) > 0:
+        yield (sample_paths, np.concatenate(embeddings_list, axis=0))

{birder_clip-0.0.2.dev5 → birder_clip-0.0.2.dev7}/birder_clip/inference/zero_shot.py RENAMED Viewed

@@ -13,18 +13,57 @@ from collections.abc import Callable
 from collections.abc import Iterator
 from collections.abc import Sequence
 from typing import Optional
+from typing import Protocol
 import numpy as np
 import numpy.typing as npt
 import torch
 import torch.nn.functional as F
 from torch.utils.data import DataLoader
+from torchvision.transforms import v2
+from torchvision.transforms.v2.functional import five_crop
 from tqdm import tqdm
 from birder_clip.net.base import BaseNet
 from birder_clip.tokenizers.base import Tokenizer
+class ZeroShotInferenceModule(Protocol):
+    def __call__(self, inputs: torch.Tensor, *, tta: bool = False, return_logits: bool = False) -> torch.Tensor: ...
+class ZeroShotInference:
+    def __init__(self, net: BaseNet, text_embeddings: torch.Tensor) -> None:
+        self.net = net
+        self.text_embeddings = text_embeddings
+    def __call__(self, inputs: torch.Tensor, *, tta: bool = False, return_logits: bool = False) -> torch.Tensor:
+        inputs = inputs.to(self.text_embeddings.device, non_blocking=True)
+        if tta is True:
+            _, _, H, W = inputs.size()
+            crop_h = int(H * 0.8)
+            crop_w = int(W * 0.8)
+            tta_inputs = five_crop(inputs, size=[crop_h, crop_w])
+            t = v2.Resize((H, W), interpolation=v2.InterpolationMode.BICUBIC, antialias=True)
+            outs = []
+            for tta_input in tta_inputs:
+                image_embeddings = self.net.encode_image(t(tta_input), normalize=True)
+                logits = self.net.forward_logits(image_embeddings, self.text_embeddings)
+                if return_logits is True:
+                    outs.append(logits)
+                else:
+                    outs.append(F.softmax(logits, dim=-1))
+            return torch.stack(outs).mean(dim=0)
+        image_embeddings = self.net.encode_image(inputs, normalize=True)
+        logits = self.net.forward_logits(image_embeddings, self.text_embeddings)
+        if return_logits is True:
+            return logits
+        return F.softmax(logits, dim=-1)
 def render_prompts(class_names: Sequence[str], templates: Sequence[str]) -> list[str]:
     return [template.format(class_name) for class_name in class_names for template in templates]
@@ -66,9 +105,9 @@ DataloaderInferenceResult = tuple[list[str], npt.NDArray[np.float32], npt.NDArra
 def infer_dataloader_iter(
     device: torch.device,
-    net: BaseNet | torch.ScriptModule,
+    net: ZeroShotInferenceModule,
     dataloader: DataLoader,
-    text_embeddings: torch.Tensor,
+    tta: bool = False,
     return_logits: bool = False,
     model_dtype: torch.dtype = torch.float32,
     amp: bool = False,
@@ -80,7 +119,6 @@ def infer_dataloader_iter(
     if chunk_size is None:
         chunk_size = float("inf")
-    net.to(device, dtype=model_dtype)
     out_list: list[npt.NDArray[np.float32]] = []
     labels_list: list[npt.NDArray[np.int64]] = []
     sample_paths: list[str] = []
@@ -90,14 +128,10 @@ def infer_dataloader_iter(
             batch_size = inputs.size(0)
             # Inference
-            inputs = inputs.to(device, dtype=model_dtype)
+            inputs = inputs.to(dtype=model_dtype)
             with torch.amp.autocast(device.type, enabled=amp, dtype=amp_dtype):
-                image_embeddings = net.encode_image(inputs, normalize=True)
-                logits = net.forward_logits(image_embeddings, text_embeddings)
-                if return_logits is True:
-                    out = logits.cpu().float().numpy()
-                else:
-                    out = F.softmax(logits, dim=-1).cpu().float().numpy()
+                out = net(inputs, return_logits=return_logits, tta=tta)
+                out = out.cpu().float().numpy()
             out_list.append(out)

{birder_clip-0.0.2.dev5 → birder_clip-0.0.2.dev7}/birder_clip/loss/__init__.py RENAMED Viewed

@@ -1,5 +1,7 @@
+from birder_clip.loss.coca import CoCaLoss
 from birder_clip.loss.contrastive import CLIPLoss
 __all__ = [
+    "CoCaLoss",
     "CLIPLoss",
 ]

birder-clip 0.0.2.dev5__tar.gz → 0.0.2.dev7__tar.gz

birder-clip 0.0.2.dev5__tar.gz → 0.0.2.dev7__tar.gz