hafnia 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +130 -16
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +83 -22
  11. hafnia/dataset/format_conversions/format_image_classification_folder.py +110 -0
  12. hafnia/dataset/format_conversions/format_yolo.py +164 -0
  13. hafnia/dataset/format_conversions/torchvision_datasets.py +287 -0
  14. hafnia/dataset/hafnia_dataset.py +396 -96
  15. hafnia/dataset/operations/dataset_stats.py +84 -73
  16. hafnia/dataset/operations/dataset_transformations.py +116 -47
  17. hafnia/dataset/operations/table_transformations.py +135 -17
  18. hafnia/dataset/primitives/bbox.py +25 -14
  19. hafnia/dataset/primitives/bitmask.py +22 -15
  20. hafnia/dataset/primitives/classification.py +16 -8
  21. hafnia/dataset/primitives/point.py +7 -3
  22. hafnia/dataset/primitives/polygon.py +15 -10
  23. hafnia/dataset/primitives/primitive.py +1 -1
  24. hafnia/dataset/primitives/segmentation.py +12 -9
  25. hafnia/experiment/hafnia_logger.py +0 -9
  26. hafnia/platform/dataset_recipe.py +7 -2
  27. hafnia/platform/datasets.py +5 -9
  28. hafnia/platform/download.py +24 -90
  29. hafnia/torch_helpers.py +12 -12
  30. hafnia/utils.py +17 -0
  31. hafnia/visualizations/image_visualizations.py +3 -1
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/METADATA +11 -9
  33. hafnia-0.4.1.dist-info/RECORD +57 -0
  34. hafnia-0.3.0.dist-info/RECORD +0 -53
  35. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/WHEEL +0 -0
  36. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/entry_points.txt +0 -0
  37. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/format_conversions/torchvision_datasets.py (new file)
@@ -0,0 +1,287 @@
+ import inspect
+ import os
+ import shutil
+ import tempfile
+ import textwrap
+ from pathlib import Path
+ from typing import Callable, Dict, List, Optional, Tuple
+
+ from rich.progress import track
+ from torchvision import datasets as tv_datasets
+ from torchvision.datasets import VisionDataset
+ from torchvision.datasets.utils import download_and_extract_archive, extract_archive
+
+ from hafnia import utils
+ from hafnia.dataset.dataset_helpers import save_pil_image_with_hash_name
+ from hafnia.dataset.dataset_names import SplitName
+ from hafnia.dataset.format_conversions.format_image_classification_folder import (
+     from_image_classification_folder,
+ )
+ from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+ from hafnia.dataset.primitives import Classification
+
+
+ def torchvision_to_hafnia_converters() -> Dict[str, Callable]:
+     return {
+         "mnist": mnist_as_hafnia_dataset,
+         "cifar10": cifar10_as_hafnia_dataset,
+         "cifar100": cifar100_as_hafnia_dataset,
+         "caltech-101": caltech_101_as_hafnia_dataset,
+         "caltech-256": caltech_256_as_hafnia_dataset,
+     }
+
+
+ def mnist_as_hafnia_dataset(force_redownload=False, n_samples: Optional[int] = None) -> HafniaDataset:
+     samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+         dataset_loader=tv_datasets.MNIST,
+         force_redownload=force_redownload,
+         n_samples=n_samples,
+     )
+
+     dataset_info = DatasetInfo(
+         dataset_name="mnist",
+         version="1.1.0",
+         tasks=tasks,
+         reference_bibtex=textwrap.dedent("""\
+             @article{lecun2010mnist,
+               title={MNIST handwritten digit database},
+               author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+               journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+               volume={2},
+               year={2010}
+             }"""),
+         reference_paper_url=None,
+         reference_dataset_page="http://yann.lecun.com/exdb/mnist",
+     )
+     return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+ def cifar10_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+     return cifar_as_hafnia_dataset(dataset_name="cifar10", force_redownload=force_redownload, n_samples=n_samples)
+
+
+ def cifar100_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+     return cifar_as_hafnia_dataset(dataset_name="cifar100", force_redownload=force_redownload, n_samples=n_samples)
+
+
+ def caltech_101_as_hafnia_dataset(
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     dataset_name = "caltech-101"
+     path_image_classification_folder = _download_and_extract_caltech_dataset(
+         dataset_name, force_redownload=force_redownload
+     )
+     hafnia_dataset = from_image_classification_folder(
+         path_image_classification_folder,
+         split=SplitName.TRAIN,
+         n_samples=n_samples,
+     )
+     hafnia_dataset.info.dataset_name = dataset_name
+     hafnia_dataset.info.version = "1.1.0"
+     hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+         @article{FeiFei2004LearningGV,
+           title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian
+                  Approach Tested on 101 Object Categories},
+           author={Li Fei-Fei and Rob Fergus and Pietro Perona},
+           journal={Computer Vision and Pattern Recognition Workshop},
+           year={2004},
+         }
+         """)
+     hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/mzrjq-6wc02"
+
+     return hafnia_dataset
+
+
+ def caltech_256_as_hafnia_dataset(
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     dataset_name = "caltech-256"
+
+     path_image_classification_folder = _download_and_extract_caltech_dataset(
+         dataset_name, force_redownload=force_redownload
+     )
+     hafnia_dataset = from_image_classification_folder(
+         path_image_classification_folder,
+         split=SplitName.TRAIN,
+         n_samples=n_samples,
+     )
+     hafnia_dataset.info.dataset_name = dataset_name
+     hafnia_dataset.info.version = "1.1.0"
+     hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+         @misc{griffin_2023_5sv1j-ytw97,
+           author = {Griffin, Gregory and
+                     Holub, Alex and
+                     Perona, Pietro},
+           title = {Caltech-256 Object Category Dataset},
+           month = aug,
+           year = 2023,
+           publisher = {California Institute of Technology},
+           version = {public},
+         }""")
+     hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/nyy15-4j048"
+
+     task = hafnia_dataset.info.get_task_by_primitive(Classification)
+
+     # Class Mapping: To remove numeric prefixes from class names
+     # E.g. "001.ak47 --> ak47", "002.american-flag --> american-flag", ...
+     class_mapping = {name: name.split(".")[-1] for name in task.class_names or []}
+     hafnia_dataset = hafnia_dataset.class_mapper(class_mapping=class_mapping, task_name=task.name)
+     return hafnia_dataset
+
+
+ def cifar_as_hafnia_dataset(
+     dataset_name: str,
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     if dataset_name == "cifar10":
+         dataset_loader = tv_datasets.CIFAR10
+     elif dataset_name == "cifar100":
+         dataset_loader = tv_datasets.CIFAR100
+     else:
+         raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: cifar10, cifar100")
+     samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+         dataset_loader=dataset_loader,
+         force_redownload=force_redownload,
+         n_samples=n_samples,
+     )
+
+     dataset_info = DatasetInfo(
+         dataset_name=dataset_name,
+         version="1.1.0",
+         tasks=tasks,
+         reference_bibtex=textwrap.dedent("""\
+             @@TECHREPORT{Krizhevsky09learningmultiple,
+               author = {Alex Krizhevsky},
+               title = {Learning multiple layers of features from tiny images},
+               institution = {},
+               year = {2009}
+             }"""),
+         reference_paper_url="https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf",
+         reference_dataset_page="https://www.cs.toronto.edu/~kriz/cifar.html",
+     )
+
+     return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+ def torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+     dataset_loader: VisionDataset,
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> Tuple[List[Sample], List[TaskInfo]]:
+     """
+     Converts a certain group of torchvision-based image classification datasets to a Hafnia Dataset.
+
+     This conversion only works for certain group of image classification VisionDataset by torchvision.
+     Common for these datasets is:
+     1) They provide a 'class_to_idx' mapping,
+     2) A "train" boolean parameter in the init function to separate training and test data - thus no validation split
+        is available for these datasets,
+     3) Datasets are in-memory and not on disk
+     4) Samples consist of a PIL image and a class index.
+
+     """
+     torchvision_dataset_name = dataset_loader.__name__
+
+     # Check if loader has train-parameter using inspect module
+     params = inspect.signature(dataset_loader).parameters
+
+     has_train_param = ("train" in params) and (params["train"].annotation is bool)
+     if not has_train_param:
+         raise ValueError(
+             f"The dataset loader '{dataset_loader.__name__}' does not have a 'train: bool' parameter in the init "
+             "function. This is a sign that the wrong dataset loader is being used. This conversion function only "
+             "works for certain image classification datasets provided by torchvision that are similar to e.g. "
+             "MNIST, CIFAR-10, CIFAR-100"
+         )
+
+     path_torchvision_dataset = utils.get_path_torchvision_downloads() / torchvision_dataset_name
+     path_hafnia_conversions = utils.get_path_hafnia_conversions() / torchvision_dataset_name
+
+     if force_redownload:
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+         shutil.rmtree(path_hafnia_conversions, ignore_errors=True)
+
+     splits = {
+         SplitName.TRAIN: dataset_loader(root=path_torchvision_dataset, train=True, download=True),
+         SplitName.TEST: dataset_loader(root=path_torchvision_dataset, train=False, download=True),
+     }
+
+     samples = []
+     n_samples_per_split = n_samples // len(splits) if n_samples is not None else None
+     for split_name, torchvision_dataset in splits.items():
+         class_name_to_index = torchvision_dataset.class_to_idx
+         class_index_to_name = {v: k for k, v in class_name_to_index.items()}
+         description = f"Convert '{torchvision_dataset_name}' ({split_name} split) to Hafnia Dataset "
+         samples_in_split = []
+         for image, class_idx in track(torchvision_dataset, total=n_samples_per_split, description=description):
+             (width, height) = image.size
+             path_image = save_pil_image_with_hash_name(image, path_hafnia_conversions)
+             sample = Sample(
+                 file_path=str(path_image),
+                 height=height,
+                 width=width,
+                 split=split_name,
+                 classifications=[
+                     Classification(
+                         class_name=class_index_to_name[class_idx],
+                         class_idx=class_idx,
+                     )
+                 ],
+             )
+             samples_in_split.append(sample)
+
+             if n_samples_per_split is not None and len(samples_in_split) >= n_samples_per_split:
+                 break
+
+         samples.extend(samples_in_split)
+     class_names = list(class_name_to_index.keys())
+     tasks = [TaskInfo(primitive=Classification, class_names=class_names)]
+
+     return samples, tasks
+
+
+ def _download_and_extract_caltech_dataset(dataset_name: str, force_redownload: bool) -> Path:
+     path_torchvision_dataset = utils.get_path_torchvision_downloads() / dataset_name
+
+     if force_redownload:
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+
+     if path_torchvision_dataset.exists():
+         return path_torchvision_dataset
+
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         path_tmp_output = Path(tmpdirname)
+         path_tmp_output.mkdir(parents=True, exist_ok=True)
+
+         if dataset_name == "caltech-101":
+             download_and_extract_archive(
+                 "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+                 download_root=path_tmp_output,
+                 filename="caltech-101.zip",
+                 md5="3138e1922a9193bfa496528edbbc45d0",
+             )
+             path_output_extracted = path_tmp_output / "caltech-101"
+             for gzip_file in os.listdir(path_output_extracted):
+                 if gzip_file.endswith(".gz"):
+                     extract_archive(os.path.join(path_output_extracted, gzip_file), path_output_extracted)
+             path_org = path_output_extracted / "101_ObjectCategories"
+
+         elif dataset_name == "caltech-256":
+             org_dataset_name = "256_ObjectCategories"
+             path_org = path_tmp_output / org_dataset_name
+             download_and_extract_archive(
+                 url=f"https://data.caltech.edu/records/nyy15-4j048/files/{org_dataset_name}.tar",
+                 download_root=path_tmp_output,
+                 md5="67b4f42ca05d46448c6bb8ecd2220f6d",
+                 remove_finished=True,
+             )
+
+         else:
+             raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: caltech-101, caltech-256")
+
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+         shutil.move(path_org, path_torchvision_dataset)
+     return path_torchvision_dataset
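
Usage sketch: the converters introduced in the new torchvision_datasets.py above could be called as follows. The function names and keyword arguments (force_redownload, n_samples) are taken from the diff; the import path is an assumption inferred from the new file's location in the wheel and is not confirmed by package documentation.

    # Sketch only: module path inferred from hafnia/dataset/format_conversions/torchvision_datasets.py
    from hafnia.dataset.format_conversions.torchvision_datasets import (
        mnist_as_hafnia_dataset,
        torchvision_to_hafnia_converters,
    )

    # Convert MNIST to a HafniaDataset, keeping roughly 200 samples split across train/test
    mnist_dataset = mnist_as_hafnia_dataset(n_samples=200)

    # Or pick a converter by name from the registry returned by torchvision_to_hafnia_converters()
    converter = torchvision_to_hafnia_converters()["cifar10"]
    cifar10_dataset = converter(force_redownload=False, n_samples=100)

Both paths download the source dataset via torchvision on first use and re-encode each sample as an image file plus a Classification annotation, per the code shown in the diff.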