hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. cli/__main__.py +16 -3
  2. cli/config.py +45 -4
  3. cli/consts.py +1 -1
  4. cli/dataset_cmds.py +6 -14
  5. cli/dataset_recipe_cmds.py +78 -0
  6. cli/experiment_cmds.py +226 -43
  7. cli/keychain.py +88 -0
  8. cli/profile_cmds.py +10 -6
  9. cli/runc_cmds.py +5 -5
  10. cli/trainer_package_cmds.py +65 -0
  11. hafnia/__init__.py +2 -0
  12. hafnia/data/factory.py +1 -2
  13. hafnia/dataset/dataset_helpers.py +9 -14
  14. hafnia/dataset/dataset_names.py +10 -5
  15. hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
  16. hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
  17. hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
  18. hafnia/dataset/dataset_upload_helper.py +265 -56
  19. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  20. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  21. hafnia/dataset/hafnia_dataset.py +577 -213
  22. hafnia/dataset/license_types.py +63 -0
  23. hafnia/dataset/operations/dataset_stats.py +259 -3
  24. hafnia/dataset/operations/dataset_transformations.py +332 -7
  25. hafnia/dataset/operations/table_transformations.py +43 -5
  26. hafnia/dataset/primitives/__init__.py +8 -0
  27. hafnia/dataset/primitives/bbox.py +25 -12
  28. hafnia/dataset/primitives/bitmask.py +26 -14
  29. hafnia/dataset/primitives/classification.py +16 -8
  30. hafnia/dataset/primitives/point.py +7 -3
  31. hafnia/dataset/primitives/polygon.py +16 -9
  32. hafnia/dataset/primitives/segmentation.py +10 -7
  33. hafnia/experiment/hafnia_logger.py +111 -8
  34. hafnia/http.py +16 -2
  35. hafnia/platform/__init__.py +9 -3
  36. hafnia/platform/builder.py +12 -10
  37. hafnia/platform/dataset_recipe.py +104 -0
  38. hafnia/platform/datasets.py +47 -9
  39. hafnia/platform/download.py +25 -19
  40. hafnia/platform/experiment.py +51 -56
  41. hafnia/platform/trainer_package.py +57 -0
  42. hafnia/utils.py +81 -13
  43. hafnia/visualizations/image_visualizations.py +4 -4
  44. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
  45. hafnia-0.4.0.dist-info/RECORD +56 -0
  46. cli/recipe_cmds.py +0 -45
  47. hafnia-0.2.4.dist-info/RECORD +0 -49
  48. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  49. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  50. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/format_conversions/torchvision_datasets.py (new file)
@@ -0,0 +1,281 @@
+ import inspect
+ import os
+ import shutil
+ import tempfile
+ import textwrap
+ from pathlib import Path
+ from typing import Callable, Dict, List, Optional, Tuple
+
+ from rich.progress import track
+ from torchvision import datasets as tv_datasets
+ from torchvision.datasets import VisionDataset
+ from torchvision.datasets.utils import download_and_extract_archive, extract_archive
+
+ from hafnia import utils
+ from hafnia.dataset.dataset_helpers import save_pil_image_with_hash_name
+ from hafnia.dataset.dataset_names import SplitName
+ from hafnia.dataset.format_conversions.image_classification_from_directory import (
+     import_image_classification_directory_tree,
+ )
+ from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+ from hafnia.dataset.primitives import Classification
+
+
+ def torchvision_to_hafnia_converters() -> Dict[str, Callable]:
+     return {
+         "mnist": mnist_as_hafnia_dataset,
+         "cifar10": cifar10_as_hafnia_dataset,
+         "cifar100": cifar100_as_hafnia_dataset,
+         "caltech-101": caltech_101_as_hafnia_dataset,
+         "caltech-256": caltech_256_as_hafnia_dataset,
+     }
+
+
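The registry above maps dataset names to converter functions, so callers can resolve a converter by name. A minimal usage sketch (the `n_samples` value is arbitrary; the first call triggers the torchvision download):

    from hafnia.dataset.format_conversions.torchvision_datasets import (
        torchvision_to_hafnia_converters,
    )

    converters = torchvision_to_hafnia_converters()
    dataset = converters["mnist"](n_samples=100)  # small subset for a quick smoke test
    print(dataset.info.dataset_name, dataset.info.version)
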
+ def mnist_as_hafnia_dataset(force_redownload=False, n_samples: Optional[int] = None) -> HafniaDataset:
+     samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+         dataset_loader=tv_datasets.MNIST,
+         force_redownload=force_redownload,
+         n_samples=n_samples,
+     )
+
+     dataset_info = DatasetInfo(
+         dataset_name="mnist",
+         version="1.1.0",
+         tasks=tasks,
+         reference_bibtex=textwrap.dedent("""\
+             @article{lecun2010mnist,
+               title={MNIST handwritten digit database},
+               author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+               journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+               volume={2},
+               year={2010}
+             }"""),
+         reference_paper_url=None,
+         reference_dataset_page="http://yann.lecun.com/exdb/mnist",
+     )
+     return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+ def cifar10_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+     return cifar_as_hafnia_dataset(dataset_name="cifar10", force_redownload=force_redownload, n_samples=n_samples)
+
+
+ def cifar100_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+     return cifar_as_hafnia_dataset(dataset_name="cifar100", force_redownload=force_redownload, n_samples=n_samples)
+
+
+ def caltech_101_as_hafnia_dataset(
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     dataset_name = "caltech-101"
+     path_image_classification_folder = _download_and_extract_caltech_dataset(
+         dataset_name, force_redownload=force_redownload
+     )
+     hafnia_dataset = import_image_classification_directory_tree(
+         path_image_classification_folder,
+         split=SplitName.TRAIN,
+         n_samples=n_samples,
+     )
+     hafnia_dataset.info.dataset_name = dataset_name
+     hafnia_dataset.info.version = "1.1.0"
+     hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+         @article{FeiFei2004LearningGV,
+           title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian
+                  Approach Tested on 101 Object Categories},
+           author={Li Fei-Fei and Rob Fergus and Pietro Perona},
+           journal={Computer Vision and Pattern Recognition Workshop},
+           year={2004},
+         }
+     """)
+     hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/mzrjq-6wc02"
+
+     return hafnia_dataset
+
+
+ def caltech_256_as_hafnia_dataset(
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     dataset_name = "caltech-256"
+
+     path_image_classification_folder = _download_and_extract_caltech_dataset(
+         dataset_name, force_redownload=force_redownload
+     )
+     hafnia_dataset = import_image_classification_directory_tree(
+         path_image_classification_folder,
+         split=SplitName.TRAIN,
+         n_samples=n_samples,
+     )
+     hafnia_dataset.info.dataset_name = dataset_name
+     hafnia_dataset.info.version = "1.1.0"
+     hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+         @misc{griffin_2023_5sv1j-ytw97,
+           author    = {Griffin, Gregory and
+                        Holub, Alex and
+                        Perona, Pietro},
+           title     = {Caltech-256 Object Category Dataset},
+           month     = aug,
+           year      = 2023,
+           publisher = {California Institute of Technology},
+           version   = {public},
+         }""")
+     hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/nyy15-4j048"
+
+     return hafnia_dataset
+
+
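Both Caltech converters first materialize the archive as a class-per-subdirectory tree and then delegate to import_image_classification_directory_tree. The same importer can be pointed at your own data; a sketch, assuming a hypothetical my_images/ folder with one subdirectory per class:

    from pathlib import Path

    from hafnia.dataset.dataset_names import SplitName
    from hafnia.dataset.format_conversions.image_classification_from_directory import (
        import_image_classification_directory_tree,
    )

    # Hypothetical layout: my_images/cat/*.jpg, my_images/dog/*.jpg, ...
    dataset = import_image_classification_directory_tree(
        Path("my_images"),
        split=SplitName.TRAIN,
        n_samples=None,  # import everything
    )
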
+ def cifar_as_hafnia_dataset(
+     dataset_name: str,
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> HafniaDataset:
+     if dataset_name == "cifar10":
+         dataset_loader = tv_datasets.CIFAR10
+     elif dataset_name == "cifar100":
+         dataset_loader = tv_datasets.CIFAR100
+     else:
+         raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: cifar10, cifar100")
+     samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+         dataset_loader=dataset_loader,
+         force_redownload=force_redownload,
+         n_samples=n_samples,
+     )
+
+     dataset_info = DatasetInfo(
+         dataset_name=dataset_name,
+         version="1.1.0",
+         tasks=tasks,
+         reference_bibtex=textwrap.dedent("""\
+             @TECHREPORT{Krizhevsky09learningmultiple,
+               author = {Alex Krizhevsky},
+               title = {Learning multiple layers of features from tiny images},
+               institution = {},
+               year = {2009}
+             }"""),
+         reference_paper_url="https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf",
+         reference_dataset_page="https://www.cs.toronto.edu/~kriz/cifar.html",
+     )
+
+     return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+ def torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+     dataset_loader: VisionDataset,
+     force_redownload: bool = False,
+     n_samples: Optional[int] = None,
+ ) -> Tuple[List[Sample], List[TaskInfo]]:
+     """
+     Converts a certain group of torchvision-based image classification datasets to Hafnia samples and tasks.
+
+     This conversion only works for a certain group of image classification VisionDatasets from torchvision.
+     Common to these datasets is that:
+     1) they provide a 'class_to_idx' mapping,
+     2) they take a "train" boolean parameter in the init function to separate training and test data - thus
+        no validation split is available for these datasets,
+     3) the data is held in memory rather than on disk, and
+     4) each sample consists of a PIL image and a class index.
+     """
+     torchvision_dataset_name = dataset_loader.__name__
+
+     # Check if the loader has a 'train' parameter using the inspect module
+     params = inspect.signature(dataset_loader).parameters
+
+     has_train_param = ("train" in params) and (params["train"].annotation is bool)
+     if not has_train_param:
+         raise ValueError(
+             f"The dataset loader '{dataset_loader.__name__}' does not have a 'train: bool' parameter in the init "
+             "function. This is a sign that the wrong dataset loader is being used. This conversion function only "
+             "works for certain image classification datasets provided by torchvision that are similar to e.g. "
+             "MNIST, CIFAR-10, CIFAR-100."
+         )
+
+     path_torchvision_dataset = utils.get_path_torchvision_downloads() / torchvision_dataset_name
+     path_hafnia_conversions = utils.get_path_hafnia_conversions() / torchvision_dataset_name
+
+     if force_redownload:
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+         shutil.rmtree(path_hafnia_conversions, ignore_errors=True)
+
+     splits = {
+         SplitName.TRAIN: dataset_loader(root=path_torchvision_dataset, train=True, download=True),
+         SplitName.TEST: dataset_loader(root=path_torchvision_dataset, train=False, download=True),
+     }
+
+     samples = []
+     n_samples_per_split = n_samples // len(splits) if n_samples is not None else None
+     for split_name, torchvision_dataset in splits.items():
+         class_name_to_index = torchvision_dataset.class_to_idx
+         class_index_to_name = {v: k for k, v in class_name_to_index.items()}
+         description = f"Convert '{torchvision_dataset_name}' ({split_name} split) to Hafnia Dataset "
+         samples_in_split = []
+         for image, class_idx in track(torchvision_dataset, total=n_samples_per_split, description=description):
+             (width, height) = image.size
+             path_image = save_pil_image_with_hash_name(image, path_hafnia_conversions)
+             sample = Sample(
+                 file_path=str(path_image),
+                 height=height,
+                 width=width,
+                 split=split_name,
+                 classifications=[
+                     Classification(
+                         class_name=class_index_to_name[class_idx],
+                         class_idx=class_idx,
+                     )
+                 ],
+             )
+             samples_in_split.append(sample)
+
+             if n_samples_per_split is not None and len(samples_in_split) >= n_samples_per_split:
+                 break
+
+         samples.extend(samples_in_split)
+         class_names = list(class_name_to_index.keys())
+         tasks = [TaskInfo(primitive=Classification, class_names=class_names)]
+
+     return samples, tasks
+
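To make the docstring's contract concrete, here is a minimal, hypothetical VisionDataset (TinyDataset is invented for illustration) that would pass the checks in this function: it exposes class_to_idx, takes a `train: bool` init parameter, and yields (PIL image, class index) pairs.

    import inspect
    from typing import Tuple

    from PIL import Image
    from torchvision.datasets import VisionDataset


    class TinyDataset(VisionDataset):
        """Hypothetical stand-in satisfying the converter's contract."""

        class_to_idx = {"circle": 0, "square": 1}

        def __init__(self, root, train: bool = True, download: bool = False):
            super().__init__(root)
            self.train = train

        def __len__(self) -> int:
            return 4

        def __getitem__(self, index: int) -> Tuple[Image.Image, int]:
            # Each sample is a PIL image plus a class index, as required by trait 4)
            return Image.new("RGB", (32, 32)), index % 2


    # Passes the converter's signature check: 'train' is annotated as bool
    assert inspect.signature(TinyDataset).parameters["train"].annotation is bool
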
+
+ def _download_and_extract_caltech_dataset(dataset_name: str, force_redownload: bool) -> Path:
+     path_torchvision_dataset = utils.get_path_torchvision_downloads() / dataset_name
+
+     if force_redownload:
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+
+     if path_torchvision_dataset.exists():
+         return path_torchvision_dataset
+
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         path_tmp_output = Path(tmpdirname)
+         path_tmp_output.mkdir(parents=True, exist_ok=True)
+
+         if dataset_name == "caltech-101":
+             download_and_extract_archive(
+                 "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+                 download_root=path_tmp_output,
+                 filename="caltech-101.zip",
+                 md5="3138e1922a9193bfa496528edbbc45d0",
+             )
+             path_output_extracted = path_tmp_output / "caltech-101"
+             for gzip_file in os.listdir(path_output_extracted):
+                 if gzip_file.endswith(".gz"):
+                     extract_archive(os.path.join(path_output_extracted, gzip_file), path_output_extracted)
+             path_org = path_output_extracted / "101_ObjectCategories"
+
+         elif dataset_name == "caltech-256":
+             org_dataset_name = "256_ObjectCategories"
+             path_org = path_tmp_output / org_dataset_name
+             download_and_extract_archive(
+                 url=f"https://data.caltech.edu/records/nyy15-4j048/files/{org_dataset_name}.tar",
+                 download_root=path_tmp_output,
+                 md5="67b4f42ca05d46448c6bb8ecd2220f6d",
+                 remove_finished=True,
+             )
+
+         else:
+             raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: caltech-101, caltech-256")
+
+         shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+         shutil.move(path_org, path_torchvision_dataset)
+     return path_torchvision_dataset
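Note the caching behaviour of this helper: an existing extraction under get_path_torchvision_downloads() is returned as-is, and force_redownload=True deletes it first. A usage sketch via the public Caltech entry point (download sizes and times will vary; n_samples only limits the import step, not the download):

    from hafnia.dataset.format_conversions.torchvision_datasets import caltech_256_as_hafnia_dataset

    # First call downloads and extracts the archive
    dataset = caltech_256_as_hafnia_dataset(n_samples=50)

    # Subsequent calls reuse the cached extraction
    dataset = caltech_256_as_hafnia_dataset(n_samples=50)

    # Force a clean re-download if the cache is suspect
    dataset = caltech_256_as_hafnia_dataset(force_redownload=True)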