hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +16 -3
- cli/config.py +45 -4
- cli/consts.py +1 -1
- cli/dataset_cmds.py +6 -14
- cli/dataset_recipe_cmds.py +78 -0
- cli/experiment_cmds.py +226 -43
- cli/keychain.py +88 -0
- cli/profile_cmds.py +10 -6
- cli/runc_cmds.py +5 -5
- cli/trainer_package_cmds.py +65 -0
- hafnia/__init__.py +2 -0
- hafnia/data/factory.py +1 -2
- hafnia/dataset/dataset_helpers.py +9 -14
- hafnia/dataset/dataset_names.py +10 -5
- hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
- hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
- hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
- hafnia/dataset/dataset_upload_helper.py +265 -56
- hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
- hafnia/dataset/hafnia_dataset.py +577 -213
- hafnia/dataset/license_types.py +63 -0
- hafnia/dataset/operations/dataset_stats.py +259 -3
- hafnia/dataset/operations/dataset_transformations.py +332 -7
- hafnia/dataset/operations/table_transformations.py +43 -5
- hafnia/dataset/primitives/__init__.py +8 -0
- hafnia/dataset/primitives/bbox.py +25 -12
- hafnia/dataset/primitives/bitmask.py +26 -14
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +16 -9
- hafnia/dataset/primitives/segmentation.py +10 -7
- hafnia/experiment/hafnia_logger.py +111 -8
- hafnia/http.py +16 -2
- hafnia/platform/__init__.py +9 -3
- hafnia/platform/builder.py +12 -10
- hafnia/platform/dataset_recipe.py +104 -0
- hafnia/platform/datasets.py +47 -9
- hafnia/platform/download.py +25 -19
- hafnia/platform/experiment.py +51 -56
- hafnia/platform/trainer_package.py +57 -0
- hafnia/utils.py +81 -13
- hafnia/visualizations/image_visualizations.py +4 -4
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
- hafnia-0.4.0.dist-info/RECORD +56 -0
- cli/recipe_cmds.py +0 -45
- hafnia-0.2.4.dist-info/RECORD +0 -49
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/format_conversions/torchvision_datasets.py (new file)
@@ -0,0 +1,281 @@
+import inspect
+import os
+import shutil
+import tempfile
+import textwrap
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Tuple
+
+from rich.progress import track
+from torchvision import datasets as tv_datasets
+from torchvision.datasets import VisionDataset
+from torchvision.datasets.utils import download_and_extract_archive, extract_archive
+
+from hafnia import utils
+from hafnia.dataset.dataset_helpers import save_pil_image_with_hash_name
+from hafnia.dataset.dataset_names import SplitName
+from hafnia.dataset.format_conversions.image_classification_from_directory import (
+    import_image_classification_directory_tree,
+)
+from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.primitives import Classification
+
+
+def torchvision_to_hafnia_converters() -> Dict[str, Callable]:
+    return {
+        "mnist": mnist_as_hafnia_dataset,
+        "cifar10": cifar10_as_hafnia_dataset,
+        "cifar100": cifar100_as_hafnia_dataset,
+        "caltech-101": caltech_101_as_hafnia_dataset,
+        "caltech-256": caltech_256_as_hafnia_dataset,
+    }
+
+
+def mnist_as_hafnia_dataset(force_redownload=False, n_samples: Optional[int] = None) -> HafniaDataset:
+    samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+        dataset_loader=tv_datasets.MNIST,
+        force_redownload=force_redownload,
+        n_samples=n_samples,
+    )
+
+    dataset_info = DatasetInfo(
+        dataset_name="mnist",
+        version="1.1.0",
+        tasks=tasks,
+        reference_bibtex=textwrap.dedent("""\
+            @article{lecun2010mnist,
+            title={MNIST handwritten digit database},
+            author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+            journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+            volume={2},
+            year={2010}
+            }"""),
+        reference_paper_url=None,
+        reference_dataset_page="http://yann.lecun.com/exdb/mnist",
+    )
+    return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+def cifar10_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+    return cifar_as_hafnia_dataset(dataset_name="cifar10", force_redownload=force_redownload, n_samples=n_samples)
+
+
+def cifar100_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+    return cifar_as_hafnia_dataset(dataset_name="cifar100", force_redownload=force_redownload, n_samples=n_samples)
+
+
+def caltech_101_as_hafnia_dataset(
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    dataset_name = "caltech-101"
+    path_image_classification_folder = _download_and_extract_caltech_dataset(
+        dataset_name, force_redownload=force_redownload
+    )
+    hafnia_dataset = import_image_classification_directory_tree(
+        path_image_classification_folder,
+        split=SplitName.TRAIN,
+        n_samples=n_samples,
+    )
+    hafnia_dataset.info.dataset_name = dataset_name
+    hafnia_dataset.info.version = "1.1.0"
+    hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+        @article{FeiFei2004LearningGV,
+        title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian
+               Approach Tested on 101 Object Categories},
+        author={Li Fei-Fei and Rob Fergus and Pietro Perona},
+        journal={Computer Vision and Pattern Recognition Workshop},
+        year={2004},
+        }
+    """)
+    hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/mzrjq-6wc02"
+
+    return hafnia_dataset
+
+
+def caltech_256_as_hafnia_dataset(
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    dataset_name = "caltech-256"
+
+    path_image_classification_folder = _download_and_extract_caltech_dataset(
+        dataset_name, force_redownload=force_redownload
+    )
+    hafnia_dataset = import_image_classification_directory_tree(
+        path_image_classification_folder,
+        split=SplitName.TRAIN,
+        n_samples=n_samples,
+    )
+    hafnia_dataset.info.dataset_name = dataset_name
+    hafnia_dataset.info.version = "1.1.0"
+    hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+        @misc{griffin_2023_5sv1j-ytw97,
+        author = {Griffin, Gregory and
+                  Holub, Alex and
+                  Perona, Pietro},
+        title = {Caltech-256 Object Category Dataset},
+        month = aug,
+        year = 2023,
+        publisher = {California Institute of Technology},
+        version = {public},
+        }""")
+    hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/nyy15-4j048"
+
+    return hafnia_dataset
+
+
+def cifar_as_hafnia_dataset(
+    dataset_name: str,
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    if dataset_name == "cifar10":
+        dataset_loader = tv_datasets.CIFAR10
+    elif dataset_name == "cifar100":
+        dataset_loader = tv_datasets.CIFAR100
+    else:
+        raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: cifar10, cifar100")
+    samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+        dataset_loader=dataset_loader,
+        force_redownload=force_redownload,
+        n_samples=n_samples,
+    )
+
+    dataset_info = DatasetInfo(
+        dataset_name=dataset_name,
+        version="1.1.0",
+        tasks=tasks,
+        reference_bibtex=textwrap.dedent("""\
+            @TECHREPORT{Krizhevsky09learningmultiple,
+            author = {Alex Krizhevsky},
+            title = {Learning multiple layers of features from tiny images},
+            institution = {},
+            year = {2009}
+            }"""),
+        reference_paper_url="https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf",
+        reference_dataset_page="https://www.cs.toronto.edu/~kriz/cifar.html",
+    )
+
+    return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+def torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+    dataset_loader: VisionDataset,
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> Tuple[List[Sample], List[TaskInfo]]:
+    """
+    Converts a certain group of torchvision-based image classification datasets to a Hafnia Dataset.
+
+    This conversion only works for a certain group of image classification VisionDatasets provided by torchvision.
+    Common for these datasets is:
+    1) They provide a 'class_to_idx' mapping,
+    2) A "train" boolean parameter in the init function to separate training and test data - thus no validation split
+       is available for these datasets,
+    3) Datasets are in-memory and not on disk,
+    4) Samples consist of a PIL image and a class index.
+
+    """
+    torchvision_dataset_name = dataset_loader.__name__
+
+    # Check if the loader has a 'train' parameter using the inspect module
+    params = inspect.signature(dataset_loader).parameters
+
+    has_train_param = ("train" in params) and (params["train"].annotation is bool)
+    if not has_train_param:
+        raise ValueError(
+            f"The dataset loader '{dataset_loader.__name__}' does not have a 'train: bool' parameter in the init "
+            "function. This is a sign that the wrong dataset loader is being used. This conversion function only "
+            "works for certain image classification datasets provided by torchvision that are similar to e.g. "
+            "MNIST, CIFAR-10, CIFAR-100"
+        )
+
+    path_torchvision_dataset = utils.get_path_torchvision_downloads() / torchvision_dataset_name
+    path_hafnia_conversions = utils.get_path_hafnia_conversions() / torchvision_dataset_name
+
+    if force_redownload:
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+        shutil.rmtree(path_hafnia_conversions, ignore_errors=True)
+
+    splits = {
+        SplitName.TRAIN: dataset_loader(root=path_torchvision_dataset, train=True, download=True),
+        SplitName.TEST: dataset_loader(root=path_torchvision_dataset, train=False, download=True),
+    }
+
+    samples = []
+    n_samples_per_split = n_samples // len(splits) if n_samples is not None else None
+    for split_name, torchvision_dataset in splits.items():
+        class_name_to_index = torchvision_dataset.class_to_idx
+        class_index_to_name = {v: k for k, v in class_name_to_index.items()}
+        description = f"Convert '{torchvision_dataset_name}' ({split_name} split) to Hafnia Dataset "
+        samples_in_split = []
+        for image, class_idx in track(torchvision_dataset, total=n_samples_per_split, description=description):
+            (width, height) = image.size
+            path_image = save_pil_image_with_hash_name(image, path_hafnia_conversions)
+            sample = Sample(
+                file_path=str(path_image),
+                height=height,
+                width=width,
+                split=split_name,
+                classifications=[
+                    Classification(
+                        class_name=class_index_to_name[class_idx],
+                        class_idx=class_idx,
+                    )
+                ],
+            )
+            samples_in_split.append(sample)
+
+            if n_samples_per_split is not None and len(samples_in_split) >= n_samples_per_split:
+                break
+
+        samples.extend(samples_in_split)
+    class_names = list(class_name_to_index.keys())
+    tasks = [TaskInfo(primitive=Classification, class_names=class_names)]
+
+    return samples, tasks
+
+
+def _download_and_extract_caltech_dataset(dataset_name: str, force_redownload: bool) -> Path:
+    path_torchvision_dataset = utils.get_path_torchvision_downloads() / dataset_name
+
+    if force_redownload:
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+
+    if path_torchvision_dataset.exists():
+        return path_torchvision_dataset
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        path_tmp_output = Path(tmpdirname)
+        path_tmp_output.mkdir(parents=True, exist_ok=True)
+
+        if dataset_name == "caltech-101":
+            download_and_extract_archive(
+                "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+                download_root=path_tmp_output,
+                filename="caltech-101.zip",
+                md5="3138e1922a9193bfa496528edbbc45d0",
+            )
+            path_output_extracted = path_tmp_output / "caltech-101"
+            for gzip_file in os.listdir(path_output_extracted):
+                if gzip_file.endswith(".gz"):
+                    extract_archive(os.path.join(path_output_extracted, gzip_file), path_output_extracted)
+            path_org = path_output_extracted / "101_ObjectCategories"
+
+        elif dataset_name == "caltech-256":
+            org_dataset_name = "256_ObjectCategories"
+            path_org = path_tmp_output / org_dataset_name
+            download_and_extract_archive(
+                url=f"https://data.caltech.edu/records/nyy15-4j048/files/{org_dataset_name}.tar",
+                download_root=path_tmp_output,
+                md5="67b4f42ca05d46448c6bb8ecd2220f6d",
+                remove_finished=True,
+            )
+
+        else:
+            raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: caltech-101, caltech-256")
+
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+        shutil.move(path_org, path_torchvision_dataset)
+        return path_torchvision_dataset
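
For orientation, the sketch below shows how the converters introduced in this new module could be called after installing hafnia 0.4.0. It is an editorial example based on the function signatures in the hunk above, not code shipped in the wheel, and it assumes torchvision is installed alongside hafnia.

# Editorial sketch, not part of the diff: exercise the torchvision converters added in 0.4.0.
from hafnia.dataset.format_conversions.torchvision_datasets import (
    mnist_as_hafnia_dataset,
    torchvision_to_hafnia_converters,
)

# Convert a small MNIST subset; n_samples is divided evenly between the train and test splits.
dataset = mnist_as_hafnia_dataset(n_samples=200)
print(dataset.info.dataset_name, dataset.info.version)  # expected: mnist 1.1.0

# The registry maps dataset names to converter callables, e.g. for scripted or CLI-driven imports.
converters = torchvision_to_hafnia_converters()
cifar10 = converters["cifar10"](n_samples=100)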