birder-clip 0.0.2.dev4__tar.gz → 0.0.2.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder_clip-0.0.2.dev6/PKG-INFO +151 -0
- birder_clip-0.0.2.dev6/README.md +89 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/common/fs_ops.py +100 -3
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/common/lib.py +2 -2
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/common/training_cli.py +145 -5
- birder_clip-0.0.2.dev6/birder_clip/common/training_utils.py +99 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/data/datasets/webdataset.py +2 -2
- birder_clip-0.0.2.dev6/birder_clip/inference/zero_shot.py +155 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/loss/__init__.py +2 -0
- birder_clip-0.0.2.dev6/birder_clip/loss/coca.py +66 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/model_registry/model_registry.py +40 -19
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/net/__init__.py +2 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/net/base.py +14 -2
- birder_clip-0.0.2.dev6/birder_clip/net/clip.py +719 -0
- birder_clip-0.0.2.dev6/birder_clip/net/coca.py +397 -0
- birder_clip-0.0.2.dev6/birder_clip/net/text/__init__.py +9 -0
- birder_clip-0.0.2.dev6/birder_clip/net/text/base.py +142 -0
- birder_clip-0.0.2.dev6/birder_clip/net/text/conditioned_decoder.py +441 -0
- birder_clip-0.0.2.dev4/birder_clip/net/text/transformer.py → birder_clip-0.0.2.dev6/birder_clip/net/text/encoder.py +135 -19
- birder_clip-0.0.2.dev6/birder_clip/net/text/hf.py +154 -0
- birder_clip-0.0.2.dev6/birder_clip/net/text/prefix_decoder.py +1 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/scripts/train.py +298 -53
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/scripts/zero_shot.py +268 -182
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/__init__.py +2 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/base.py +4 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/hf.py +15 -0
- birder_clip-0.0.2.dev6/birder_clip/tokenizers/openvision.py +66 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/simple_tokenizer.py +2 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tools/__main__.py +13 -0
- birder_clip-0.0.2.dev6/birder_clip/tools/convert_model.py +268 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tools/download_tokenizer.py +5 -4
- birder_clip-0.0.2.dev6/birder_clip/tools/list_models.py +105 -0
- birder_clip-0.0.2.dev6/birder_clip/tools/model_info.py +145 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tools/show_iterator.py +14 -13
- birder_clip-0.0.2.dev6/birder_clip/tools/stats.py +210 -0
- birder_clip-0.0.2.dev6/birder_clip/version.py +1 -0
- birder_clip-0.0.2.dev6/birder_clip.egg-info/PKG-INFO +151 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip.egg-info/SOURCES.txt +12 -1
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip.egg-info/requires.txt +3 -3
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/requirements/_requirements-dev.txt +2 -2
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/requirements/requirements.txt +1 -1
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/tests/test_common.py +3 -3
- birder_clip-0.0.2.dev6/tests/test_loss.py +129 -0
- birder_clip-0.0.2.dev6/tests/test_model_registry.py +102 -0
- birder_clip-0.0.2.dev6/tests/test_net.py +572 -0
- birder_clip-0.0.2.dev6/tests/test_net_text.py +96 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/tests/test_tokenizers.py +29 -0
- birder_clip-0.0.2.dev4/PKG-INFO +0 -72
- birder_clip-0.0.2.dev4/README.md +0 -10
- birder_clip-0.0.2.dev4/birder_clip/common/training_utils.py +0 -61
- birder_clip-0.0.2.dev4/birder_clip/inference/zero_shot.py +0 -54
- birder_clip-0.0.2.dev4/birder_clip/net/clip.py +0 -282
- birder_clip-0.0.2.dev4/birder_clip/net/text/__init__.py +0 -5
- birder_clip-0.0.2.dev4/birder_clip/net/text/base.py +0 -45
- birder_clip-0.0.2.dev4/birder_clip/version.py +0 -1
- birder_clip-0.0.2.dev4/birder_clip.egg-info/PKG-INFO +0 -72
- birder_clip-0.0.2.dev4/tests/test_loss.py +0 -47
- birder_clip-0.0.2.dev4/tests/test_model_registry.py +0 -74
- birder_clip-0.0.2.dev4/tests/test_net.py +0 -147
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/LICENSE +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/common/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/conf/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/conf/settings.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/data/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/data/datasets/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/data/datasets/csv.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/data/datasets/fake.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/inference/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/inference/zero_shot_templates.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/loss/contrastive.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/model_registry/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/model_registry/manifest.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/py.typed +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/scripts/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/bpe_simple_vocab_16e6.txt.gz +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tokenizers/registry.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip/tools/__init__.py +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip.egg-info/dependency_links.txt +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/birder_clip.egg-info/top_level.txt +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/pyproject.toml +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/setup.cfg +0 -0
- {birder_clip-0.0.2.dev4 → birder_clip-0.0.2.dev6}/tests/test_datasets.py +0 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: birder_clip
|
|
3
|
+
Version: 0.0.2.dev6
|
|
4
|
+
Summary: A Birder extension for CLIP-style image-text modeling and multimodal computer vision workflows.
|
|
5
|
+
Author: Ofer Hasson
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/birder/birder-clip
|
|
8
|
+
Project-URL: Issues, https://gitlab.com/birder/birder-clip/-/issues
|
|
9
|
+
Project-URL: Changelog, https://gitlab.com/birder/birder-clip/-/blob/main/CHANGELOG.md
|
|
10
|
+
Keywords: computer-vision,clip,image-text,pytorch,deep-learning,artificial intelligence
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Education
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
20
|
+
Classifier: Topic :: Software Development
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: birder>=0.6.0
|
|
28
|
+
Requires-Dist: ftfy>=6.3.1
|
|
29
|
+
Requires-Dist: regex>=2025.7.29
|
|
30
|
+
Requires-Dist: tqdm>=4.67.0
|
|
31
|
+
Requires-Dist: webdataset>=0.2.111
|
|
32
|
+
Requires-Dist: huggingface_hub
|
|
33
|
+
Requires-Dist: transformers
|
|
34
|
+
Requires-Dist: torch>=2.10.0
|
|
35
|
+
Requires-Dist: torchvision
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: bandit~=1.9.4; extra == "dev"
|
|
38
|
+
Requires-Dist: black~=26.5.0; extra == "dev"
|
|
39
|
+
Requires-Dist: build~=1.5.0; extra == "dev"
|
|
40
|
+
Requires-Dist: bumpver~=2026.1132; extra == "dev"
|
|
41
|
+
Requires-Dist: coverage~=7.14.1; extra == "dev"
|
|
42
|
+
Requires-Dist: debugpy; extra == "dev"
|
|
43
|
+
Requires-Dist: flake8-pep585~=0.1.7; extra == "dev"
|
|
44
|
+
Requires-Dist: flake8~=7.3.0; extra == "dev"
|
|
45
|
+
Requires-Dist: invoke~=3.0.3; extra == "dev"
|
|
46
|
+
Requires-Dist: ipython; extra == "dev"
|
|
47
|
+
Requires-Dist: isort~=8.0.1; extra == "dev"
|
|
48
|
+
Requires-Dist: mkdocs~=1.6.1; extra == "dev"
|
|
49
|
+
Requires-Dist: mkdocs-exclude~=1.0.2; extra == "dev"
|
|
50
|
+
Requires-Dist: mypy~=2.1.0; extra == "dev"
|
|
51
|
+
Requires-Dist: parameterized~=0.9.0; extra == "dev"
|
|
52
|
+
Requires-Dist: pylint~=4.0.6; extra == "dev"
|
|
53
|
+
Requires-Dist: pytest; extra == "dev"
|
|
54
|
+
Requires-Dist: requests~=2.34.2; extra == "dev"
|
|
55
|
+
Requires-Dist: safetensors~=0.7.0; extra == "dev"
|
|
56
|
+
Requires-Dist: setuptools; extra == "dev"
|
|
57
|
+
Requires-Dist: twine~=6.2.0; extra == "dev"
|
|
58
|
+
Requires-Dist: types-requests~=2.33.0; extra == "dev"
|
|
59
|
+
Requires-Dist: urllib3~=2.7.0; extra == "dev"
|
|
60
|
+
Requires-Dist: wheel; extra == "dev"
|
|
61
|
+
Dynamic: license-file
|
|
62
|
+
|
|
63
|
+
# Birder CLIP
|
|
64
|
+
|
|
65
|
+
Birder CLIP is an early-stage Birder extension for CLIP-style image-text models, focused on practical inference and fine-tuning workflows.
|
|
66
|
+
|
|
67
|
+
- [Introduction](#introduction)
|
|
68
|
+
- [Setup](#setup)
|
|
69
|
+
- [Getting Started](#getting-started)
|
|
70
|
+
- [Training](#training)
|
|
71
|
+
- [Project Status and Contributions](#project-status-and-contributions)
|
|
72
|
+
- [Licenses](#licenses)
|
|
73
|
+
- [Acknowledgments](#acknowledgments)
|
|
74
|
+
|
|
75
|
+
## Introduction
|
|
76
|
+
|
|
77
|
+
Birder CLIP extends [Birder](https://gitlab.com/birder/birder) with image-text models for zero-shot classification, image-text retrieval-style workflows, caption generation and related multimodal computer vision tasks.
|
|
78
|
+
|
|
79
|
+
The project is aimed at image-text modeling rather than general vision-language model (VLM) chat or instruction-following systems.
|
|
80
|
+
It currently includes CLIP-style components, tokenizers, model registry utilities, inference scripts and training code.
|
|
81
|
+
Full training is supported, but for large-scale CLIP pretraining you are probably better served by [OpenCLIP](https://github.com/mlfoundations/open_clip).
|
|
82
|
+
|
|
83
|
+
## Setup
|
|
84
|
+
|
|
85
|
+
1. Ensure your environment meets the minimum requirements:
|
|
86
|
+
- Python 3.11 or newer
|
|
87
|
+
- PyTorch 2.10 or newer (installed for your hardware/driver stack)
|
|
88
|
+
- Birder 0.6.0 or newer
|
|
89
|
+
|
|
90
|
+
1. Install the latest Birder CLIP version:
|
|
91
|
+
|
|
92
|
+
```sh
|
|
93
|
+
pip install birder-clip
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Getting Started
|
|
97
|
+
|
|
98
|
+
List available image-text models:
|
|
99
|
+
|
|
100
|
+
```sh
|
|
101
|
+
python -m birder_clip.tools list-models --image-text
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
List available pretrained weights:
|
|
105
|
+
|
|
106
|
+
```sh
|
|
107
|
+
python -m birder_clip.tools list-models --pretrained --verbose
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Run zero-shot classification on a directory of images:
|
|
111
|
+
|
|
112
|
+
```sh
|
|
113
|
+
python -m birder_clip.scripts.zero_shot -n laion_clip_vit_l14 --classes eagle hawk falcon --template-set default --gpu data/images
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
For detailed options, run:
|
|
117
|
+
|
|
118
|
+
```sh
|
|
119
|
+
python -m birder_clip.scripts.zero_shot --help
|
|
120
|
+
python -m birder_clip.tools --help
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Training
|
|
124
|
+
|
|
125
|
+
Birder CLIP includes training support for image-text datasets in CSV and WebDataset formats, including CLIP, CoCa and LiT-style workflows.
|
|
126
|
+
|
|
127
|
+
## Project Status and Contributions
|
|
128
|
+
|
|
129
|
+
Birder CLIP is an early alpha project. APIs, model names, checkpoints, training recipes and command-line options may change without notice.
|
|
130
|
+
|
|
131
|
+
This is currently a personal project in active development. Suggestions, bug reports and feedback are welcome through the project's issue tracker, but the project is not yet stable enough for broad external contributions.
|
|
132
|
+
|
|
133
|
+
## Licenses
|
|
134
|
+
|
|
135
|
+
The code in this project is primarily licensed under Apache 2.0. See [LICENSE](LICENSE) for details.
|
|
136
|
+
|
|
137
|
+
Some model implementations, pretrained weights, tokenizers and converted artifacts may be derived from or depend on projects and datasets with their own licenses and usage restrictions.
|
|
138
|
+
|
|
139
|
+
**You are responsible for ensuring compliance with all licenses and conditions of any dependent licenses.**
|
|
140
|
+
|
|
141
|
+
### Disclaimer
|
|
142
|
+
|
|
143
|
+
If you intend to use Birder CLIP, its pretrained weights, or any associated datasets in a commercial product, we strongly recommend seeking legal advice to ensure compliance with all relevant licenses and terms of use.
|
|
144
|
+
|
|
145
|
+
It's the user's responsibility to ensure that their use of this project, including any pretrained weights or datasets, complies with all applicable licenses and legal requirements.
|
|
146
|
+
|
|
147
|
+
## Acknowledgments
|
|
148
|
+
|
|
149
|
+
Birder CLIP owes much to the work of others in computer vision, image-text representation learning and open-source machine learning.
|
|
150
|
+
|
|
151
|
+
Special thanks to the [OpenCLIP](https://github.com/mlfoundations/open_clip) project, which serves as the main reference implementation and inspiration for much of the CLIP-style modeling and training work here. The same principle as in Birder applies: this project stands on the shoulders of many open-source projects, papers and datasets. If an attribution is missing, please open an issue.
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Birder CLIP
|
|
2
|
+
|
|
3
|
+
Birder CLIP is an early-stage Birder extension for CLIP-style image-text models, focused on practical inference and fine-tuning workflows.
|
|
4
|
+
|
|
5
|
+
- [Introduction](#introduction)
|
|
6
|
+
- [Setup](#setup)
|
|
7
|
+
- [Getting Started](#getting-started)
|
|
8
|
+
- [Training](#training)
|
|
9
|
+
- [Project Status and Contributions](#project-status-and-contributions)
|
|
10
|
+
- [Licenses](#licenses)
|
|
11
|
+
- [Acknowledgments](#acknowledgments)
|
|
12
|
+
|
|
13
|
+
## Introduction
|
|
14
|
+
|
|
15
|
+
Birder CLIP extends [Birder](https://gitlab.com/birder/birder) with image-text models for zero-shot classification, image-text retrieval-style workflows, caption generation and related multimodal computer vision tasks.
|
|
16
|
+
|
|
17
|
+
The project is aimed at image-text modeling rather than general vision-language model (VLM) chat or instruction-following systems.
|
|
18
|
+
It currently includes CLIP-style components, tokenizers, model registry utilities, inference scripts and training code.
|
|
19
|
+
Full training is supported, but for large-scale CLIP pretraining you are probably better served by [OpenCLIP](https://github.com/mlfoundations/open_clip).
|
|
20
|
+
|
|
21
|
+
## Setup
|
|
22
|
+
|
|
23
|
+
1. Ensure your environment meets the minimum requirements:
|
|
24
|
+
- Python 3.11 or newer
|
|
25
|
+
- PyTorch 2.10 or newer (installed for your hardware/driver stack)
|
|
26
|
+
- Birder 0.6.0 or newer
|
|
27
|
+
|
|
28
|
+
1. Install the latest Birder CLIP version:
|
|
29
|
+
|
|
30
|
+
```sh
|
|
31
|
+
pip install birder-clip
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Getting Started
|
|
35
|
+
|
|
36
|
+
List available image-text models:
|
|
37
|
+
|
|
38
|
+
```sh
|
|
39
|
+
python -m birder_clip.tools list-models --image-text
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
List available pretrained weights:
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
python -m birder_clip.tools list-models --pretrained --verbose
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Run zero-shot classification on a directory of images:
|
|
49
|
+
|
|
50
|
+
```sh
|
|
51
|
+
python -m birder_clip.scripts.zero_shot -n laion_clip_vit_l14 --classes eagle hawk falcon --template-set default --gpu data/images
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
For detailed options, run:
|
|
55
|
+
|
|
56
|
+
```sh
|
|
57
|
+
python -m birder_clip.scripts.zero_shot --help
|
|
58
|
+
python -m birder_clip.tools --help
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Training
|
|
62
|
+
|
|
63
|
+
Birder CLIP includes training support for image-text datasets in CSV and WebDataset formats, including CLIP, CoCa and LiT-style workflows.
|
|
64
|
+
|
|
65
|
+
## Project Status and Contributions
|
|
66
|
+
|
|
67
|
+
Birder CLIP is an early alpha project. APIs, model names, checkpoints, training recipes and command-line options may change without notice.
|
|
68
|
+
|
|
69
|
+
This is currently a personal project in active development. Suggestions, bug reports and feedback are welcome through the project's issue tracker, but the project is not yet stable enough for broad external contributions.
|
|
70
|
+
|
|
71
|
+
## Licenses
|
|
72
|
+
|
|
73
|
+
The code in this project is primarily licensed under Apache 2.0. See [LICENSE](LICENSE) for details.
|
|
74
|
+
|
|
75
|
+
Some model implementations, pretrained weights, tokenizers and converted artifacts may be derived from or depend on projects and datasets with their own licenses and usage restrictions.
|
|
76
|
+
|
|
77
|
+
**You are responsible for ensuring compliance with all licenses and conditions of any dependent licenses.**
|
|
78
|
+
|
|
79
|
+
### Disclaimer
|
|
80
|
+
|
|
81
|
+
If you intend to use Birder CLIP, its pretrained weights, or any associated datasets in a commercial product, we strongly recommend seeking legal advice to ensure compliance with all relevant licenses and terms of use.
|
|
82
|
+
|
|
83
|
+
It's the user's responsibility to ensure that their use of this project, including any pretrained weights or datasets, complies with all applicable licenses and legal requirements.
|
|
84
|
+
|
|
85
|
+
## Acknowledgments
|
|
86
|
+
|
|
87
|
+
Birder CLIP owes much to the work of others in computer vision, image-text representation learning and open-source machine learning.
|
|
88
|
+
|
|
89
|
+
Special thanks to the [OpenCLIP](https://github.com/mlfoundations/open_clip) project, which serves as the main reference implementation and inspiration for much of the CLIP-style modeling and training work here. The same principle as in Birder applies: this project stands on the shoulders of many open-source projects, papers and datasets. If an attribution is missing, please open an issue.
|
|
@@ -67,7 +67,7 @@ def model_path(
|
|
|
67
67
|
network_name: str,
|
|
68
68
|
*,
|
|
69
69
|
epoch: Optional[int | str] = None,
|
|
70
|
-
|
|
70
|
+
st: bool = False,
|
|
71
71
|
states: bool = False,
|
|
72
72
|
) -> Path:
|
|
73
73
|
if epoch is not None:
|
|
@@ -77,8 +77,10 @@ def model_path(
|
|
|
77
77
|
|
|
78
78
|
if states is True:
|
|
79
79
|
file_name = f"{file_name}_states.pt"
|
|
80
|
+
elif st is True:
|
|
81
|
+
file_name = f"{file_name}.safetensors"
|
|
80
82
|
else:
|
|
81
|
-
file_name = f"{file_name}.
|
|
83
|
+
file_name = f"{file_name}.pt"
|
|
82
84
|
|
|
83
85
|
return settings.MODELS_DIR.joinpath(file_name)
|
|
84
86
|
|
|
@@ -109,6 +111,30 @@ def _checkpoint_states(
|
|
|
109
111
|
torch.save(kwargs, states_path)
|
|
110
112
|
|
|
111
113
|
|
|
114
|
+
def _checkpoint_states_from_state_dicts(
|
|
115
|
+
states_path: Path,
|
|
116
|
+
optimizer_state: Optional[dict[str, Any]],
|
|
117
|
+
scheduler_state: Optional[dict[str, Any]],
|
|
118
|
+
scaler_state: Optional[dict[str, Any]],
|
|
119
|
+
model_base_state: Optional[dict[str, Any]],
|
|
120
|
+
**extra_states: Optional[dict[str, Any]],
|
|
121
|
+
) -> None:
|
|
122
|
+
if optimizer_state is None or scheduler_state is None:
|
|
123
|
+
return
|
|
124
|
+
|
|
125
|
+
logger.info(f"Saving checkpoint states {states_path}...")
|
|
126
|
+
torch.save(
|
|
127
|
+
{
|
|
128
|
+
"optimizer_state": optimizer_state,
|
|
129
|
+
"scheduler_state": scheduler_state,
|
|
130
|
+
"scaler_state": scaler_state,
|
|
131
|
+
"model_base_state": model_base_state,
|
|
132
|
+
**extra_states,
|
|
133
|
+
},
|
|
134
|
+
states_path,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
|
|
112
138
|
class TrainingStates(NamedTuple):
|
|
113
139
|
optimizer_state: Optional[dict[str, Any]]
|
|
114
140
|
scheduler_state: Optional[dict[str, Any]]
|
|
@@ -182,6 +208,50 @@ def checkpoint_model(
|
|
|
182
208
|
_checkpoint_states(states_path, optimizer, scheduler, scaler, model_base, **extra_states)
|
|
183
209
|
|
|
184
210
|
|
|
211
|
+
def checkpoint_model_from_state_dicts(
    network_name: str,
    epoch: int,
    model_state: dict[str, Any],
    task: Any,
    signature: SignatureType,
    rgb_stats: RGBType,
    optimizer_state: Optional[dict[str, Any]],
    scheduler_state: Optional[dict[str, Any]],
    scaler_state: Optional[dict[str, Any]],
    model_base_state: Optional[dict[str, Any]],
    *,
    external_config: Optional[dict[str, Any]] = None,
    **extra_states: Optional[dict[str, Any]],
) -> None:
    """
    Save a model checkpoint and its training states from already-extracted state dicts

    The model checkpoint is written to the path returned by ``model_path`` for the
    given network name and epoch. The training states are then handed to
    ``_checkpoint_states_from_state_dicts`` together with the matching states path.

    Parameters
    ----------
    network_name
        Network name used to build the checkpoint file names.
    epoch
        Epoch used to build the checkpoint file names.
    model_state
        Model state dict, stored under the "state" key.
    task
        Task identifier stored in the checkpoint.
    signature
        Model signature stored in the checkpoint.
    rgb_stats
        RGB statistics stored in the checkpoint.
    optimizer_state
        Optimizer state dict, forwarded to the states checkpoint.
    scheduler_state
        Scheduler state dict, forwarded to the states checkpoint.
    scaler_state
        Gradient scaler state dict, forwarded to the states checkpoint.
    model_base_state
        Base model state dict, forwarded to the states checkpoint.
    external_config
        Optional configuration, stored under the "config" key only when provided.
    **extra_states
        Additional named states forwarded to the states checkpoint.
    """

    # The "config" entry is only present in the checkpoint when a config was supplied
    kwargs: dict[str, Any] = {}
    if external_config is not None:
        kwargs["config"] = external_config

    path = model_path(network_name, epoch=epoch)
    states_path = model_path(network_name, epoch=epoch, states=True)
    logger.info(f"Saving model checkpoint {path}...")
    torch.save(
        {
            "state": model_state,
            "birder_clip_version": __version__,
            "task": task,
            "signature": signature,
            "rgb_stats": rgb_stats,
            **kwargs,
        },
        path,
    )

    _checkpoint_states_from_state_dicts(
        states_path,
        optimizer_state,
        scheduler_state,
        scaler_state,
        model_base_state,
        **extra_states,
    )
|
|
253
|
+
|
|
254
|
+
|
|
185
255
|
def clean_checkpoints(network_name: str, keep_last: int) -> None:
|
|
186
256
|
epoch = "*[0-9]"
|
|
187
257
|
models_glob = str(model_path(network_name, epoch=epoch))
|
|
@@ -314,7 +384,7 @@ def load_model(
|
|
|
314
384
|
embed_dim=embed_dim,
|
|
315
385
|
tokenizer=tokenizer,
|
|
316
386
|
)
|
|
317
|
-
path = model_path(_network_name, epoch=epoch,
|
|
387
|
+
path = model_path(_network_name, epoch=epoch, st=st)
|
|
318
388
|
|
|
319
389
|
logger.info(f"Loading model from {path} on device {device}...")
|
|
320
390
|
|
|
@@ -589,6 +659,33 @@ def load_pretrained_tokenizer(weights: str, *, download: bool = True, **kwargs:
|
|
|
589
659
|
return get_tokenizer(tokenizer_name, **tokenizer_kwargs)
|
|
590
660
|
|
|
591
661
|
|
|
662
|
+
def save_st(
    net: torch.nn.Module,
    dst: str | Path,
    task: str,
    signature: SignatureType,
    rgb_stats: RGBType,
    *,
    external_config: Optional[dict[str, Any]] = None,
) -> None:
    """
    Save a model in safetensors format with its metadata

    The signature, RGB statistics and (optional) external config are JSON-encoded
    before being passed as metadata to ``safetensors.torch.save_model``.

    Parameters
    ----------
    net
        The model to save.
    dst
        Destination file; converted with ``str()`` before saving.
    task
        Task identifier stored in the metadata.
    signature
        Model signature, stored JSON-encoded.
    rgb_stats
        RGB statistics, stored JSON-encoded.
    external_config
        Optional configuration, stored JSON-encoded under "config" only when provided.

    Raises
    ------
    AssertionError
        If safetensors is not available.
    """

    # Raised explicitly (instead of an assert statement) so the check is not stripped under "python -O"
    if not _HAS_SAFETENSORS:
        raise AssertionError("'pip install safetensors' to use .safetensors")

    metadata: dict[str, Any] = {
        "birder_clip_version": __version__,
        "task": task,
        "signature": json.dumps(signature),
        "rgb_stats": json.dumps(rgb_stats),
    }
    if external_config is not None:
        metadata["config"] = json.dumps(external_config)

    safetensors.torch.save_model(net, str(dst), metadata)
|
|
687
|
+
|
|
688
|
+
|
|
592
689
|
def download_model_by_weights(
|
|
593
690
|
weights: str, *, dst: Optional[str | Path] = None, file_format: FileFormatType = "pt", progress_bar: bool = True
|
|
594
691
|
) -> None:
|
|
@@ -43,7 +43,7 @@ def get_image_text_network_name(
|
|
|
43
43
|
parts = [network]
|
|
44
44
|
if image_encoder is not None:
|
|
45
45
|
parts.append(image_encoder)
|
|
46
|
-
if text_encoder is not None and text_encoder != "
|
|
46
|
+
if text_encoder is not None and text_encoder != "transformer_encoder":
|
|
47
47
|
parts.append(text_encoder)
|
|
48
48
|
|
|
49
49
|
if registry.exists(network) is True:
|
|
@@ -86,7 +86,7 @@ def get_image_text_model_config(
|
|
|
86
86
|
|
|
87
87
|
if config is not None:
|
|
88
88
|
for key, value in config.items():
|
|
89
|
-
if key in {"image", "text"} and isinstance(value, dict)
|
|
89
|
+
if key in {"image", "text"} and isinstance(value, dict):
|
|
90
90
|
model_config[key] = {**model_config.get(key, {}), **value}
|
|
91
91
|
else:
|
|
92
92
|
model_config[key] = value
|
|
@@ -21,6 +21,11 @@ def add_model_args(parser: argparse.ArgumentParser) -> None:
|
|
|
21
21
|
parser.add_argument("-n", "--network", type=str, help="the image-text network to train")
|
|
22
22
|
parser.add_argument("-t", "--tag", type=str, help="add model tag")
|
|
23
23
|
parser.add_argument("--image-encoder", type=str, help="the image encoder to use")
|
|
24
|
+
parser.add_argument(
|
|
25
|
+
"--image-encoder-pretrained",
|
|
26
|
+
type=str,
|
|
27
|
+
help="pretrained Birder image model weights path to load into the image encoder",
|
|
28
|
+
)
|
|
24
29
|
parser.add_argument("--text-encoder", type=str, help="the text encoder to use")
|
|
25
30
|
parser.add_argument("--embed-dim", type=int, metavar="N", help="shared image-text embedding dimension")
|
|
26
31
|
parser.add_argument("--tokenizer", type=str, help="the tokenizer to use")
|
|
@@ -43,13 +48,31 @@ def add_model_args(parser: argparse.ArgumentParser) -> None:
|
|
|
43
48
|
|
|
44
49
|
def add_loss_args(parser: argparse.ArgumentParser) -> None:
|
|
45
50
|
group = parser.add_argument_group("Loss parameters")
|
|
46
|
-
group.add_argument("--loss", type=str, choices=["clip"], default="clip", help="loss function to use")
|
|
51
|
+
group.add_argument("--loss", type=str, choices=["clip", "coca"], default="clip", help="loss function to use")
|
|
52
|
+
group.add_argument(
|
|
53
|
+
"--coca-caption-loss-weight", type=float, default=1.0, help="weight assigned to CoCa caption loss"
|
|
54
|
+
)
|
|
55
|
+
group.add_argument(
|
|
56
|
+
"--coca-contrastive-loss-weight", type=float, default=1.0, help="weight assigned to CoCa contrastive loss"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def add_freeze_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the "Freeze parameters" argument group on the given parser

    Adds the ``--freeze-image-encoder`` flag (stored as ``freeze_image_encoder``,
    False unless the flag is passed).
    """

    group = parser.add_argument_group("Freeze parameters")
    group.add_argument(
        "--freeze-image-encoder",
        default=False,
        action="store_true",
        help="freeze image encoder body, leaving the projection head trainable",
    )
|
|
47
68
|
|
|
48
69
|
|
|
49
70
|
def add_optimization_args(parser: argparse.ArgumentParser, default_batch_size: int = 32) -> None:
|
|
50
71
|
group = parser.add_argument_group("Optimization parameters")
|
|
51
72
|
group.add_argument("--batch-size", type=int, default=default_batch_size, metavar="N", help="the batch size")
|
|
52
|
-
group.add_argument(
|
|
73
|
+
group.add_argument(
|
|
74
|
+
"--opt", type=str, choices=list(get_args(OptimizerType)), default="adamw", help="optimizer to use"
|
|
75
|
+
)
|
|
53
76
|
group.add_argument("--opt-fused", default=False, action="store_true", help="use fused optimizer implementation")
|
|
54
77
|
group.add_argument("--momentum", type=float, default=0.9, metavar="M", help="optimizer momentum")
|
|
55
78
|
group.add_argument("--nesterov", default=False, action="store_true", help="use nesterov momentum")
|
|
@@ -64,7 +87,12 @@ def add_optimization_args(parser: argparse.ArgumentParser, default_batch_size: i
|
|
|
64
87
|
metavar="N",
|
|
65
88
|
help="number of iterations to accumulate gradients per optimizer step",
|
|
66
89
|
)
|
|
67
|
-
|
|
90
|
+
group.add_argument(
|
|
91
|
+
"--grad-accum-cache-negatives",
|
|
92
|
+
default=False,
|
|
93
|
+
action="store_true",
|
|
94
|
+
help="cache features so CLIP loss uses all accumulated microbatches as negatives",
|
|
95
|
+
)
|
|
68
96
|
|
|
69
97
|
|
|
70
98
|
def add_lr_wd_args(parser: argparse.ArgumentParser) -> None:
|
|
@@ -248,6 +276,8 @@ def add_data_aug_args(
|
|
|
248
276
|
)
|
|
249
277
|
group.add_argument("--ra-magnitude", type=int, default=9, help="magnitude for all the RandAugment transformations")
|
|
250
278
|
group.add_argument("--augmix-severity", type=int, default=3, help="severity of AugMix policy")
|
|
279
|
+
group.add_argument("--clip-color-jitter-prob", type=float, default=0.0, help="CLIP color jitter probability")
|
|
280
|
+
group.add_argument("--clip-gray-prob", type=float, default=0.0, help="CLIP grayscale probability")
|
|
251
281
|
group.add_argument("--resize-min-scale", type=float, default=default_min_scale, help="random resize min scale")
|
|
252
282
|
group.add_argument(
|
|
253
283
|
"--re-prob",
|
|
@@ -318,7 +348,13 @@ def add_dataloader_args(parser: argparse.ArgumentParser) -> None:
|
|
|
318
348
|
action="store_true",
|
|
319
349
|
help="keep dataloader worker processes alive between epochs",
|
|
320
350
|
)
|
|
321
|
-
group.add_argument(
|
|
351
|
+
group.add_argument(
|
|
352
|
+
"--no-drop-last",
|
|
353
|
+
dest="drop_last",
|
|
354
|
+
default=True,
|
|
355
|
+
action="store_false",
|
|
356
|
+
help="do not drop the last incomplete batch",
|
|
357
|
+
)
|
|
322
358
|
|
|
323
359
|
|
|
324
360
|
def add_precision_args(parser: argparse.ArgumentParser) -> None:
|
|
@@ -348,6 +384,29 @@ def add_precision_args(parser: argparse.ArgumentParser) -> None:
|
|
|
348
384
|
)
|
|
349
385
|
|
|
350
386
|
|
|
387
|
+
def add_grad_checkpointing_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the "Gradient checkpointing parameters" argument group on the given parser

    Adds:
    - ``--grad-checkpointing``: boolean flag, False unless passed
    - ``--grad-checkpointing-segments``: integer, None unless passed
    - ``--no-grad-checkpointing-preserve-rng-state``: sets
      ``grad_checkpointing_preserve_rng_state`` to False (it is True by default)
    """

    group = parser.add_argument_group("Gradient checkpointing parameters")
    group.add_argument(
        "--grad-checkpointing",
        default=False,
        action="store_true",
        help="enable gradient checkpointing for supported models",
    )
    group.add_argument(
        "--grad-checkpointing-segments",
        type=int,
        metavar="N",
        help="number of checkpoint segments to request from supported models",
    )

    # Negative flag: the stored destination is the positive option, enabled by default
    group.add_argument(
        "--no-grad-checkpointing-preserve-rng-state",
        dest="grad_checkpointing_preserve_rng_state",
        default=True,
        action="store_false",
        help="disable RNG state preservation during gradient checkpointing recomputation",
    )
|
|
408
|
+
|
|
409
|
+
|
|
351
410
|
def add_compile_args(parser: argparse.ArgumentParser) -> None:
|
|
352
411
|
group = parser.add_argument_group("Compilation parameters")
|
|
353
412
|
group.add_argument("--compile", default=False, action="store_true", help="enable compilation")
|
|
@@ -410,6 +469,44 @@ def add_distributed_args(parser: argparse.ArgumentParser) -> None:
|
|
|
410
469
|
group.add_argument("--local-rank", type=int, metavar="N", help="local rank")
|
|
411
470
|
group.add_argument("--dist-url", type=str, default="env://", help="URL used to initialize distributed training")
|
|
412
471
|
group.add_argument("--dist-backend", type=str, default="nccl", help="distributed backend")
|
|
472
|
+
group.add_argument(
|
|
473
|
+
"--distributed-mode", type=str, choices=["ddp", "fsdp"], default="ddp", help="distributed training mode"
|
|
474
|
+
)
|
|
475
|
+
group.add_argument(
|
|
476
|
+
"--fsdp-sharding-strategy",
|
|
477
|
+
type=str,
|
|
478
|
+
choices=["shard-grad-op", "full-shard"],
|
|
479
|
+
default="shard-grad-op",
|
|
480
|
+
help="FSDP sharding strategy",
|
|
481
|
+
)
|
|
482
|
+
group.add_argument(
|
|
483
|
+
"--fsdp-param-dtype",
|
|
484
|
+
type=str,
|
|
485
|
+
choices=["float32", "float16", "bfloat16"],
|
|
486
|
+
help="FSDP mixed precision parameter dtype",
|
|
487
|
+
)
|
|
488
|
+
group.add_argument(
|
|
489
|
+
"--fsdp-reduce-dtype",
|
|
490
|
+
type=str,
|
|
491
|
+
choices=["float32", "float16", "bfloat16"],
|
|
492
|
+
help="FSDP mixed precision gradient reduction dtype",
|
|
493
|
+
)
|
|
494
|
+
group.add_argument(
|
|
495
|
+
"--fsdp-wrap-policy",
|
|
496
|
+
type=str,
|
|
497
|
+
choices=["block-group-regex", "min-num-params"],
|
|
498
|
+
default="block-group-regex",
|
|
499
|
+
help="FSDP module wrapping policy",
|
|
500
|
+
)
|
|
501
|
+
group.add_argument(
|
|
502
|
+
"--fsdp-wrap-min-num-params",
|
|
503
|
+
type=float,
|
|
504
|
+
metavar="M",
|
|
505
|
+
help="minimum module parameter count in millions for wrapping when using --fsdp-wrap-policy min-num-params",
|
|
506
|
+
)
|
|
507
|
+
group.add_argument(
|
|
508
|
+
"--fsdp-offload-policy", type=str, choices=["none", "cpu"], default="none", help="FSDP parameter offload policy"
|
|
509
|
+
)
|
|
413
510
|
group.add_argument(
|
|
414
511
|
"--find-unused-parameters",
|
|
415
512
|
default=False,
|
|
@@ -532,8 +629,12 @@ def common_args_validation(args: argparse.Namespace) -> None:
|
|
|
532
629
|
raise cli.ValidationError("--load-states requires --resume-epoch to be set")
|
|
533
630
|
if args.load_scheduler is True and args.resume_epoch is None:
|
|
534
631
|
raise cli.ValidationError("--load-scheduler requires --resume-epoch to be set")
|
|
535
|
-
if
|
|
632
|
+
if args.pretrained is True and args.resume_epoch is not None:
|
|
536
633
|
raise cli.ValidationError("--pretrained cannot be used with --resume-epoch")
|
|
634
|
+
if args.image_encoder_pretrained is not None and args.resume_epoch is not None:
|
|
635
|
+
raise cli.ValidationError("--image-encoder-pretrained cannot be used with --resume-epoch")
|
|
636
|
+
if args.pretrained is True and args.image_encoder_pretrained is not None:
|
|
637
|
+
raise cli.ValidationError("--image-encoder-pretrained cannot be used with --pretrained")
|
|
537
638
|
|
|
538
639
|
if args.freeze_bn is True and args.sync_bn is True:
|
|
539
640
|
raise cli.ValidationError("--freeze-bn cannot be used with --sync-bn")
|
|
@@ -559,5 +660,44 @@ def common_args_validation(args: argparse.Namespace) -> None:
|
|
|
559
660
|
raise cli.ValidationError("--context-length must be positive")
|
|
560
661
|
if args.grad_accum_steps < 1:
|
|
561
662
|
raise cli.ValidationError("--grad-accum-steps must be >= 1")
|
|
663
|
+
if args.grad_accum_cache_negatives is True and args.grad_accum_steps == 1:
|
|
664
|
+
raise cli.ValidationError("--grad-accum-cache-negatives requires --grad-accum-steps greater than 1")
|
|
665
|
+
if args.grad_accum_cache_negatives is True and args.loss == "coca":
|
|
666
|
+
raise cli.ValidationError("--grad-accum-cache-negatives is only supported with --loss clip")
|
|
667
|
+
|
|
668
|
+
if args.coca_caption_loss_weight < 0.0:
|
|
669
|
+
raise cli.ValidationError("--coca-caption-loss-weight must be non-negative")
|
|
670
|
+
if args.coca_contrastive_loss_weight < 0.0:
|
|
671
|
+
raise cli.ValidationError("--coca-contrastive-loss-weight must be non-negative")
|
|
672
|
+
|
|
673
|
+
# EMA
|
|
562
674
|
if args.model_ema_steps < 1:
|
|
563
675
|
raise cli.ValidationError("--model-ema-steps must be >= 1")
|
|
676
|
+
|
|
677
|
+
# Gradient checkpointing args
|
|
678
|
+
if args.grad_checkpointing_segments is not None and args.grad_checkpointing_segments < 1:
|
|
679
|
+
raise cli.ValidationError("--grad-checkpointing-segments must be >= 1")
|
|
680
|
+
if args.grad_checkpointing_segments is not None and args.grad_checkpointing is False:
|
|
681
|
+
raise cli.ValidationError("--grad-checkpointing-segments requires --grad-checkpointing")
|
|
682
|
+
|
|
683
|
+
if args.distributed_mode == "fsdp":
|
|
684
|
+
if args.grad_checkpointing is True:
|
|
685
|
+
raise cli.ValidationError("--grad-checkpointing cannot be used with --distributed-mode fsdp")
|
|
686
|
+
if args.sync_bn is True:
|
|
687
|
+
raise cli.ValidationError("--sync-bn cannot be used with --distributed-mode fsdp")
|
|
688
|
+
if args.find_unused_parameters is True:
|
|
689
|
+
raise cli.ValidationError("--find-unused-parameters cannot be used with --distributed-mode fsdp")
|
|
690
|
+
if args.compile_opt is True:
|
|
691
|
+
raise cli.ValidationError("--compile-opt cannot be used with --distributed-mode fsdp")
|
|
692
|
+
if args.compile_fullgraph is True:
|
|
693
|
+
raise cli.ValidationError("--compile-fullgraph cannot be used with --distributed-mode fsdp")
|
|
694
|
+
if args.cpu is True:
|
|
695
|
+
raise cli.ValidationError("--cpu cannot be used with --distributed-mode fsdp")
|
|
696
|
+
if args.model_ema is True:
|
|
697
|
+
raise cli.ValidationError("--model-ema cannot be used with --distributed-mode fsdp")
|
|
698
|
+
if args.fsdp_wrap_policy == "min-num-params" and args.fsdp_wrap_min_num_params is None:
|
|
699
|
+
raise cli.ValidationError(
|
|
700
|
+
"--fsdp-wrap-min-num-params is required when --fsdp-wrap-policy is min-num-params"
|
|
701
|
+
)
|
|
702
|
+
if args.fsdp_wrap_min_num_params is not None and args.fsdp_wrap_min_num_params <= 0:
|
|
703
|
+
raise cli.ValidationError("--fsdp-wrap-min-num-params must be > 0")
|