hafnia 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py} +148 -238
  2. hafnia/dataset/dataset_helpers.py +1 -15
  3. hafnia/dataset/dataset_names.py +43 -3
  4. hafnia/dataset/format_conversions/format_coco.py +490 -0
  5. hafnia/dataset/format_conversions/format_helpers.py +33 -0
  6. hafnia/dataset/format_conversions/format_image_classification_folder.py +95 -14
  7. hafnia/dataset/format_conversions/format_yolo.py +115 -25
  8. hafnia/dataset/format_conversions/torchvision_datasets.py +16 -11
  9. hafnia/dataset/hafnia_dataset.py +119 -490
  10. hafnia/dataset/hafnia_dataset_types.py +479 -0
  11. hafnia/dataset/license_types.py +4 -4
  12. hafnia/dataset/operations/dataset_s3_storage.py +211 -0
  13. hafnia/dataset/operations/dataset_stats.py +3 -3
  14. hafnia/dataset/operations/dataset_transformations.py +14 -17
  15. hafnia/dataset/operations/table_transformations.py +22 -14
  16. hafnia/dataset/primitives/bbox.py +6 -2
  17. hafnia/dataset/primitives/bitmask.py +21 -46
  18. hafnia/dataset/primitives/classification.py +1 -1
  19. hafnia/dataset/primitives/polygon.py +43 -2
  20. hafnia/dataset/primitives/primitive.py +1 -1
  21. hafnia/dataset/primitives/segmentation.py +1 -1
  22. hafnia/experiment/hafnia_logger.py +13 -4
  23. hafnia/http.py +2 -1
  24. hafnia/platform/datasets.py +195 -105
  25. hafnia/platform/s5cmd_utils.py +147 -0
  26. hafnia/torch_helpers.py +48 -4
  27. hafnia/utils.py +38 -0
  28. hafnia/visualizations/image_visualizations.py +3 -1
  29. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/METADATA +4 -4
  30. hafnia-0.5.0.dist-info/RECORD +62 -0
  31. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/WHEEL +1 -1
  32. hafnia_cli/dataset_cmds.py +18 -0
  33. hafnia_cli/profile_cmds.py +0 -1
  34. hafnia-0.4.2.dist-info/RECORD +0 -57
  35. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/entry_points.txt +0 -0
  36. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,490 @@
1
+ import json
2
+ import shutil
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
7
+
8
+ import polars as pl
9
+ from pycocotools import mask as coco_utils
10
+
11
+ from hafnia.dataset import license_types
12
+ from hafnia.dataset.dataset_names import SampleField, SplitName
13
+ from hafnia.dataset.format_conversions import format_coco, format_helpers
14
+ from hafnia.utils import progress_bar
15
+
16
+ if TYPE_CHECKING: # Using 'TYPE_CHECKING' to avoid circular imports during type checking
17
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
18
+
19
+ from hafnia.dataset.hafnia_dataset_types import Attribution, DatasetInfo, License, Sample, TaskInfo
20
+ from hafnia.dataset.primitives import Bbox, Bitmask
21
+ from hafnia.log import user_logger
22
+
23
+ COCO_KEY_FILE_NAME = "file_name"
24
+
25
+ HAFNIA_TO_ROBOFLOW_SPLIT_NAME = {
26
+ SplitName.TRAIN: "train",
27
+ SplitName.VAL: "valid",
28
+ SplitName.TEST: "test",
29
+ }
30
+ ROBOFLOW_ANNOTATION_FILE_NAME = "_annotations.coco.json"
31
+
32
+
33
@dataclass
class CocoSplitPaths:
    """Filesystem locations for one split of a COCO-formatted dataset."""

    split: str  # Split name for the samples in this folder (e.g. train/val/test)
    path_images: Path  # Folder containing the split's image files
    path_instances_json: Path  # Path to the split's COCO instances json (annotation file)
38
+
39
+
40
def from_coco_format(
    path_dataset: Path,
    coco_format_type: str = "roboflow",
    max_samples: Optional[int] = None,
    dataset_name: str = "coco-2017",
):
    """Load a COCO-formatted dataset folder and convert it into a HafniaDataset.

    Args:
        path_dataset: Root folder of the COCO-formatted dataset.
        coco_format_type: Folder-layout flavor (currently only "roboflow").
        max_samples: Optional cap on the total number of converted samples.
        dataset_name: Name assigned to the resulting dataset.
    """
    splits = get_split_paths_for_coco_dataset_formats(
        path_dataset=path_dataset, coco_format_type=coco_format_type
    )
    return from_coco_dataset_by_split_definitions(
        split_definitions=splits,
        max_samples=max_samples,
        dataset_name=dataset_name,
    )
57
+
58
+
59
def get_split_paths_for_coco_dataset_formats(
    path_dataset: Path,
    coco_format_type: str,
) -> List[CocoSplitPaths]:
    """Resolve the per-split image folder and annotation-file paths for a COCO layout.

    Raises:
        ValueError: If 'coco_format_type' is not a supported layout.
    """
    if coco_format_type != "roboflow":
        raise ValueError(f"The specified '{coco_format_type=}' is not supported.")

    # Roboflow layout: one sub-folder per split, each holding images plus a
    # '_annotations.coco.json' file.
    return [
        CocoSplitPaths(
            split=split_def.name,
            path_images=split_def.path,
            path_instances_json=split_def.path / ROBOFLOW_ANNOTATION_FILE_NAME,
        )
        for split_def in format_helpers.get_splits_from_folder(path_dataset)
    ]
76
+
77
+
78
def from_coco_dataset_by_split_definitions(
    split_definitions: "List[CocoSplitPaths]",
    max_samples: Optional[int],
    dataset_name: str,
) -> "HafniaDataset":
    """Convert a set of COCO split folders into one HafniaDataset.

    Args:
        split_definitions: Image folder and instances-json path for each split.
        max_samples: Optional cap on the total sample count, divided evenly
            across the provided splits.
        dataset_name: Name assigned to the resulting dataset.

    Raises:
        ValueError: If no splits are given, or task definitions are duplicated
            or inconsistent across splits.
        FileNotFoundError: If a split's label file or image folder is missing.
    """
    # Guard first: an empty split list would otherwise raise a confusing
    # ZeroDivisionError in the 'max_samples // len(...)' computation below.
    if not split_definitions:
        raise ValueError("'split_definitions' is empty - at least one split is required.")

    from hafnia.dataset.hafnia_dataset import HafniaDataset

    if max_samples is None:
        max_samples_per_split = None
    else:
        max_samples_per_split = max_samples // len(split_definitions)

    samples = []
    tasks: List[TaskInfo] = []
    for split_definition in split_definitions:
        if split_definition.path_instances_json is None or not split_definition.path_instances_json.exists():
            raise FileNotFoundError(
                f"Expected COCO dataset files not found for split '{split_definition.split}'. "
                f"Label file doesn't exist: {split_definition.path_instances_json}"
            )
        if not split_definition.path_images.exists():
            raise FileNotFoundError(
                f"Expected COCO dataset files not found for split '{split_definition.split}'. "
                f"Images folder doesn't exist: {split_definition.path_images}"
            )

        samples_in_split, tasks_in_split = coco_format_folder_with_split_to_hafnia_samples(
            path_label_file=split_definition.path_instances_json,
            max_samples_per_split=max_samples_per_split,
            path_images=split_definition.path_images,
            split_name=split_definition.split,
        )

        # Merge task definitions across splits; identical names must describe
        # identical tasks, otherwise the dataset would be inconsistent.
        for task_in_split in tasks_in_split:
            matching_tasks = [task for task in tasks if task.name == task_in_split.name]

            if not matching_tasks:
                tasks.append(task_in_split)
                continue

            if len(matching_tasks) != 1:
                raise ValueError("Duplicate task names found across splits in the COCO dataset.")
            match_task = matching_tasks[0]
            if task_in_split != match_task:
                raise ValueError(
                    f"Inconsistent task found across splits in the COCO dataset for task name '{task_in_split.name}'. "
                )

        samples.extend(samples_in_split)

    dataset_info = DatasetInfo(
        dataset_name=dataset_name,
        tasks=tasks,
    )

    return HafniaDataset.from_samples_list(samples, info=dataset_info)
135
+
136
+
137
def coco_format_folder_with_split_to_hafnia_samples(
    path_label_file: Path,
    path_images: Path,
    split_name: str,
    max_samples_per_split: Optional[int],
) -> Tuple[List[Sample], List[TaskInfo]]:
    """Convert one COCO split (instances json + image folder) into Hafnia samples.

    Args:
        path_label_file: Path to the split's COCO instances json file.
        path_images: Folder containing the split's image files.
        split_name: Split name written into every produced sample.
        max_samples_per_split: Optional cap on the number of images converted.

    Returns:
        The converted samples and the task definitions (a Bbox and a Bitmask
        task sharing the same class names).

    Raises:
        FileNotFoundError: If the label file does not exist.
    """
    if not path_label_file.exists():
        raise FileNotFoundError(f"Expected label file not found: {path_label_file}")
    user_logger.info("Loading coco label file as json")
    image_and_annotation_dict = json.loads(path_label_file.read_text())
    user_logger.info("Converting coco dataset to HafniaDataset samples")

    id_to_category, class_names = get_coco_id_category_mapping(image_and_annotation_dict.get("categories", []))
    # Both tasks are declared because COCO instance annotations carry both a
    # bounding box and a segmentation for each object.
    tasks = [
        TaskInfo(primitive=Bbox, class_names=class_names),
        TaskInfo(primitive=Bitmask, class_names=class_names),
    ]

    coco_licenses = image_and_annotation_dict.get("licenses", [])
    id_to_license_mapping = {lic["id"]: license_types.get_license_by_url(lic["url"]) for lic in coco_licenses}

    coco_images = image_and_annotation_dict.get("images", [])
    if max_samples_per_split is not None:
        # Truncation keeps the first N images as listed in the json file.
        coco_images = coco_images[:max_samples_per_split]
    id_to_image = {img["id"]: img for img in coco_images}

    # Group annotations by image id up front for O(1) lookup per image.
    img_id_to_annotations: Dict[int, List[dict]] = {}
    coco_annotations = image_and_annotation_dict.get("annotations", [])
    for annotation in coco_annotations:
        img_id = annotation["image_id"]
        if img_id not in img_id_to_annotations:
            img_id_to_annotations[img_id] = []
        img_id_to_annotations[img_id].append(annotation)

    samples = []
    for img_id, image_dict in progress_bar(
        id_to_image.items(), description=f"Convert coco to hafnia sample '{split_name}'"
    ):
        image_annotations = img_id_to_annotations.get(img_id, [])

        sample = fiftyone_coco_to_hafnia_sample(
            path_images=path_images,
            image_dict=image_dict,
            image_annotations=image_annotations,
            id_to_category=id_to_category,
            class_names=class_names,
            id_to_license_mapping=id_to_license_mapping,
            split_name=split_name,
        )
        samples.append(sample)

    return samples, tasks
189
+
190
+
191
def get_coco_id_category_mapping(
    coco_categories: List[dict],
) -> Tuple[Dict[int, dict], List[str]]:
    """Map COCO category ids to enriched category dicts and collect ordered class names.

    Each returned category dict gains a 'class_idx' key holding its position in
    the input list; class names are returned in that same order.
    """
    category_mapping: Dict[int, dict] = {}
    for class_idx, original_category in enumerate(coco_categories):
        enriched = dict(original_category)  # Copy so the caller's dicts stay untouched.
        enriched["class_idx"] = class_idx  # Position in the input list, used for ordering.
        category_mapping[enriched["id"]] = enriched

    ordered_items = sorted(category_mapping.items(), key=lambda item: item[1]["class_idx"])
    sorted_category_mapping = dict(ordered_items)
    class_names = [entry["name"] for entry in sorted_category_mapping.values()]
    return sorted_category_mapping, class_names
202
+
203
+
204
def convert_segmentation_to_rle_list(segmentation: Union[Dict, List], height: int, width: int) -> List[Dict]:
    """Normalize a COCO segmentation (polygons, uncompressed RLE, or compressed RLE) into a list of RLE dicts.

    Raises:
        ValueError: If the segmentation does not match any recognized format.
    """
    if isinstance(segmentation, list):
        # Polygon format: one or more flat coordinate lists.
        return coco_utils.frPyObjects(segmentation, height, width)

    if isinstance(segmentation, dict) and "counts" in segmentation:
        counts = segmentation["counts"]  # type: ignore
        if isinstance(counts, list):
            # Uncompressed RLE: 'counts' is a list of ints.
            return coco_utils.frPyObjects([segmentation], height, width)
        if isinstance(counts, (str, bytes)):
            # Already compressed RLE - wrap as a single-element list.
            return [segmentation]

    raise ValueError("Segmentation format not recognized for conversion to RLE.")
224
+
225
+
226
def fiftyone_coco_to_hafnia_sample(
    path_images: Path,
    image_dict: Dict,
    image_annotations: List[Dict],
    id_to_category: Dict,
    class_names: List[str],
    id_to_license_mapping: Dict[int, License],
    split_name: str,
) -> Sample:
    """Convert one COCO image record plus its annotations into a Hafnia Sample.

    Builds both a Bbox and a Bitmask primitive for every object instance.
    Keys remaining in 'image_dict' after popping the known fields are stored
    in the sample's 'meta'.

    Raises:
        FileNotFoundError: If the referenced image file is missing on disk.
    """
    image_dict = image_dict.copy()  # Create a copy to avoid modifying the original dictionary.
    file_name_relative = image_dict.pop(COCO_KEY_FILE_NAME)
    file_name = path_images / file_name_relative
    if not file_name.exists():
        raise FileNotFoundError(f"Expected image file not found: {file_name}. Please check the dataset structure.")

    img_width = image_dict.pop("width")
    img_height = image_dict.pop("height")
    bitmasks: List[Bitmask] = []
    bboxes: List[Bbox] = []
    for obj_instance in image_annotations:
        category_data = id_to_category[obj_instance["category_id"]]
        class_name = category_data["name"]  # Get the name of the category.
        class_idx = class_names.index(class_name)
        bbox_list = obj_instance["bbox"]
        # NOTE(review): only the first coordinate's type is inspected - assumes
        # COCO boxes are homogeneously typed (all float or all int). Confirm.
        if isinstance(bbox_list[0], float):  # Polygon coordinates are often floats.
            bbox_ints = [int(coord) for coord in bbox_list]
        else:
            bbox_ints = bbox_list
        rle_list = convert_segmentation_to_rle_list(obj_instance["segmentation"], height=img_height, width=img_width)
        rle = coco_utils.merge(rle_list)
        rle_string = rle["counts"]
        if isinstance(rle_string, bytes):
            rle_string = rle_string.decode("utf-8")

        if "area" in obj_instance and obj_instance["area"] is not None:
            area_px = obj_instance["area"]
        else:
            area_px = coco_utils.area(rle).item()
        # Area is stored normalized by the image area.
        area = float(area_px) / (img_height * img_width)
        bitmask = Bitmask(
            top=bbox_ints[1],
            left=bbox_ints[0],
            height=bbox_ints[3],
            width=bbox_ints[2],
            area=area,
            rle_string=rle_string,
            class_name=class_name,
            class_idx=class_idx,
            object_id=str(obj_instance["id"]),
            meta={"iscrowd": obj_instance["iscrowd"]},
        )
        bitmasks.append(bitmask)

        bbox = Bbox.from_coco(bbox=bbox_list, height=img_height, width=img_width)
        bbox.class_name = class_name
        bbox.class_idx = class_idx
        bbox.object_id = str(obj_instance["id"])  # Use the ID from the instance if available.
        bbox.meta = {"iscrowd": obj_instance["iscrowd"]}
        bbox.area = bbox.calculate_area(image_height=img_height, image_width=img_width)
        bboxes.append(bbox)

    if "license" in image_dict:
        license_data: License = id_to_license_mapping[image_dict["license"]]

        # NOTE(review): assumes 'date_captured' is always present and
        # ISO-formatted whenever a license id is set - confirm for non-COCO
        # sources, otherwise this raises KeyError/ValueError.
        capture_date = datetime.fromisoformat(image_dict["date_captured"])
        source_url = image_dict["flickr_url"] if "flickr_url" in image_dict else image_dict.get("coco_url")
        attribution = Attribution(
            date_captured=capture_date,
            licenses=[license_data],
            source_url=source_url,
        )
    else:
        attribution = None

    return Sample(
        file_path=str(file_name),
        width=img_width,
        height=img_height,
        split=split_name,
        bboxes=bboxes,  # Bboxes will be added later if needed.
        bitmasks=bitmasks,  # Add the bitmask to the sample.
        attribution=attribution,
        meta=image_dict,
    )
310
+
311
+
312
def to_coco_format(
    dataset: "HafniaDataset",
    path_output: Path,
    task_name: Optional[str] = None,
    coco_format_type: str = "roboflow",
) -> List[CocoSplitPaths]:
    """Export a HafniaDataset to a COCO-formatted folder layout on disk.

    Writes one folder per split containing the image files and an annotation
    json, and returns the resolved paths per split.

    Args:
        dataset: Source dataset to export.
        path_output: Root output folder.
        task_name: Task to export. If None, the task is auto-derived: first a
            Bitmask task, then a Bbox task.
        coco_format_type: Folder-layout flavor (currently only "roboflow").

    Raises:
        ValueError: If no (or multiple ambiguous) Bitmask/Bbox tasks exist, or
            the layout flavor is unsupported.
    """
    samples_modified_all = dataset.samples.with_row_index("id")

    # Build a license lookup table if the samples carry attribution data.
    if SampleField.ATTRIBUTION in samples_modified_all.columns:
        samples_modified_all = samples_modified_all.unnest(SampleField.ATTRIBUTION)
        license_table = (
            samples_modified_all["licenses"]
            .explode()
            .struct.unnest()
            .unique()
            .with_row_index("id")
            .select(["id", "name", "url"])
        )
        license_mapping = {lic["name"]: lic["id"] for lic in license_table.iter_rows(named=True)}
    else:
        license_mapping = None
        license_table = None

    if task_name is not None:
        task_info = dataset.info.get_task_by_name(task_name)
    else:
        # Auto derive the task to be used for COCO conversion as only one Bitmask/Bbox task can be present
        # in the coco format. Will first search for Bitmask (because COCO supports segmentation), then Bbox afterwards.
        tasks_info = dataset.info.get_tasks_by_primitive(Bitmask)
        if len(tasks_info) == 0:
            tasks_info = dataset.info.get_tasks_by_primitive(Bbox)
        if len(tasks_info) == 0:
            raise ValueError("No 'Bitmask' or 'Bbox' primitive found in dataset tasks for COCO conversion")
        if len(tasks_info) > 1:
            task_names = [task.name for task in tasks_info]
            raise ValueError(
                f"Found multiple tasks {task_names} for 'Bitmask'/'Bbox' primitive in dataset."
                " Please specify 'task_name'."
            )
        task_info = tasks_info[0]

    # COCO category ids are assigned from the task's class-name order.
    categories_list_dict = [
        {"id": i, "name": c, "supercategory": "NotDefined"} for i, c in enumerate(task_info.class_names or [])
    ]
    category_mapping = {cat["name"]: cat["id"] for cat in categories_list_dict}

    split_names = samples_modified_all[SampleField.SPLIT].unique().to_list()

    list_split_paths = []
    for split_name in split_names:
        if coco_format_type == "roboflow":
            path_split = path_output / HAFNIA_TO_ROBOFLOW_SPLIT_NAME[split_name]
            split_paths = format_coco.CocoSplitPaths(
                split=split_name,
                path_images=path_split,
                path_instances_json=path_split / ROBOFLOW_ANNOTATION_FILE_NAME,
            )
        else:
            raise ValueError(f"The specified '{coco_format_type=}' is not supported.")
        samples_in_split = samples_modified_all.filter(pl.col(SampleField.SPLIT) == split_name)
        images_table, annotation_table = _convert_bbox_bitmask_to_coco_format(
            samples_modified=samples_in_split,
            license_mapping=license_mapping,
            task_info=task_info,
            category_mapping=category_mapping,  # type: ignore[arg-type]
        )

        # Copy the image files into the split folder; annotation file_name
        # fields are rewritten to be relative to that folder.
        split_paths.path_images.mkdir(parents=True, exist_ok=True)
        src_paths = images_table[COCO_KEY_FILE_NAME].to_list()
        new_relative_image_path = []
        for src_path in src_paths:
            dst_path = split_paths.path_images / Path(src_path).name
            new_relative_image_path.append(dst_path.relative_to(split_paths.path_images).as_posix())
            # NOTE(review): an existing destination file is skipped - two source
            # images with identical basenames would silently collide. Confirm
            # that file names are unique within a split.
            if dst_path.exists():
                continue

            shutil.copy2(src_path, dst_path)

        images_table_files_moved = images_table.with_columns(
            pl.Series(new_relative_image_path).alias(COCO_KEY_FILE_NAME)
        )
        split_labels = {
            "info": dataset.info.model_dump(mode="json"),
            "images": list(images_table_files_moved.iter_rows(named=True)),
            "categories": categories_list_dict,
            "annotations": list(annotation_table.iter_rows(named=True)),
        }
        if license_table is not None:
            split_labels["licenses"] = list(license_table.iter_rows(named=True))
        split_paths.path_instances_json.parent.mkdir(parents=True, exist_ok=True)
        split_paths.path_instances_json.write_text(json.dumps(split_labels))

        list_split_paths.append(split_paths)

    return list_split_paths
407
+
408
+
409
def _convert_bbox_bitmask_to_coco_format(
    samples_modified: pl.DataFrame,
    license_mapping: Optional[Dict[str, int]],
    task_info: TaskInfo,
    category_mapping: Dict[str, int],
) -> Tuple[pl.DataFrame, pl.DataFrame]:
    """Build the COCO 'images' and 'annotations' tables for one split.

    Args:
        samples_modified: Split samples with an 'id' row-index column (and
            unnested attribution columns when 'license_mapping' is given).
        license_mapping: Maps license name -> COCO license id, or None when the
            dataset carries no attribution.
        task_info: The single Bbox or Bitmask task being exported.
        category_mapping: Maps class name -> COCO category id.

    Returns:
        The (images, annotations) polars tables in COCO column layout.

    Raises:
        ValueError: If the task primitive is neither Bbox nor Bitmask.
    """
    if task_info.primitive not in [Bbox, Bitmask]:
        raise ValueError(f"Unsupported primitive '{task_info.primitive}' for COCO conversion")

    task_sample_field = task_info.primitive.column_name()
    select_image_table_columns = [
        pl.col("id"),
        pl.col(SampleField.WIDTH).alias("width"),
        pl.col(SampleField.HEIGHT).alias("height"),
        pl.col(SampleField.FILE_PATH).alias(COCO_KEY_FILE_NAME),
    ]

    if license_mapping is not None:
        # NOTE(review): only the first license per sample is exported - COCO
        # supports a single license id per image.
        samples_modified = samples_modified.with_columns(pl.col("licenses").list.first().struct.unnest())
        select_image_table_columns = select_image_table_columns + [
            pl.col("name").replace_strict(license_mapping, return_dtype=pl.Int64).alias("license"),
            pl.col("source_url").alias("flickr_url"),
            pl.col("source_url").alias("coco_url"),
            pl.col("date_captured"),
        ]

    images_table = samples_modified.select(select_image_table_columns)

    # One row per annotation: explode the primitive list column and give every
    # annotation its own row-index id.
    annotation_table_full = (
        samples_modified.select(
            pl.col("id").alias("image_id"),
            pl.col(SampleField.HEIGHT).alias("image_height"),
            pl.col(SampleField.WIDTH).alias("image_width"),
            pl.col(task_sample_field),
        )
        .explode(task_sample_field)
        .with_row_index("id")
        .unnest(task_sample_field)
    )

    # 'iscrowd' defaults to 0 for annotations without meta data.
    iscrowd_list = [0 if row is None else row.get("iscrowd", 0) for row in annotation_table_full["meta"]]
    annotation_table_full = annotation_table_full.with_columns(pl.Series(iscrowd_list).alias("iscrowd"))

    if task_info.primitive == Bitmask:
        # Bitmask coordinates are stored in pixels; area is normalized and is
        # rescaled back to pixel area here.
        annotation_table = annotation_table_full.select(
            pl.col("id"),
            pl.col("image_id"),
            category_id=pl.col("class_name").replace_strict(category_mapping, return_dtype=pl.Int64),
            segmentation=pl.struct(
                counts=pl.col("rle_string"),
                size=pl.concat_arr(
                    pl.col("image_height"),
                    pl.col("image_width"),
                ),
            ),
            area=pl.col("area") * pl.col("image_height") * pl.col("image_width"),
            bbox=pl.concat_arr(
                pl.col("left"),  # bbox x coordinate
                pl.col("top"),  # bbox y coordinate
                pl.col("width"),  # bbox width
                pl.col("height"),  # bbox height
            ),
            iscrowd=pl.col("iscrowd"),
        )

    elif task_info.primitive == Bbox:
        # Bbox coordinates are stored normalized (0-1) and are converted to
        # pixel units; segmentation is left empty for box-only exports.
        annotation_table = annotation_table_full.select(
            pl.col("id"),
            pl.col("image_id"),
            category_id=pl.col("class_name").replace_strict(category_mapping, return_dtype=pl.Int64),
            segmentation=pl.lit([]),
            area=pl.col("height") * pl.col("width") * pl.col("image_height") * pl.col("image_width"),
            bbox=pl.concat_arr(
                pl.col("top_left_x") * pl.col("image_width"),  # x coordinate
                pl.col("top_left_y") * pl.col("image_height"),  # y coordinate
                pl.col("width") * pl.col("image_width"),  # width
                pl.col("height") * pl.col("image_height"),  # height
            ),
            iscrowd=pl.col("iscrowd"),
        )

    return images_table, annotation_table
@@ -0,0 +1,33 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ from hafnia.dataset.dataset_names import SplitName
6
+ from hafnia.log import user_logger
7
+
8
+
9
@dataclass
class SplitNameAndPath:
    """A dataset split name paired with the folder holding that split's files."""

    name: str  # Split name, expected to be one of SplitName.valid_splits()
    path: Path  # Folder containing the split's data

    def check(self) -> None:
        """Validate that 'path' is an existing directory and 'name' is a known split name."""
        path_is_directory = self.path.is_dir()
        if not path_is_directory:
            raise ValueError(f"Path '{self.path}' is not a valid directory.")

        known_split_names = SplitName.valid_splits()
        if self.name not in known_split_names:
            raise ValueError(f"Split name '{self.name}' is not a valid split name.")
20
+
21
+
22
def get_splits_from_folder(path_folder: Path) -> List[SplitNameAndPath]:
    """Scan the immediate sub-folders of 'path_folder' and map each to a known split.

    Sub-folders whose name cannot be mapped to a valid split are skipped with a
    warning; files are ignored.
    """
    discovered: List[SplitNameAndPath] = []
    sub_folders = (entry for entry in path_folder.iterdir() if entry.is_dir())
    for path_sub_folder in sub_folders:
        folder_split_name = path_sub_folder.name
        split_name = SplitName.map_split_name(folder_split_name, strict=False)
        if split_name == SplitName.UNDEFINED:
            user_logger.warning(f"Skipping sub-folder with name '{folder_split_name}'")
            continue
        discovered.append(SplitNameAndPath(name=split_name, path=path_sub_folder))
    return discovered
@@ -5,29 +5,79 @@ from typing import TYPE_CHECKING, List, Optional
5
5
  import more_itertools
6
6
  import polars as pl
7
7
  from PIL import Image
8
- from rich.progress import track
9
8
 
10
9
  from hafnia.dataset.dataset_names import PrimitiveField, SampleField
10
+ from hafnia.dataset.format_conversions.format_helpers import SplitNameAndPath, get_splits_from_folder
11
+ from hafnia.dataset.hafnia_dataset_types import DatasetInfo, Sample, TaskInfo
11
12
  from hafnia.dataset.primitives import Classification
12
- from hafnia.utils import is_image_file
13
+ from hafnia.utils import is_image_file, progress_bar
13
14
 
14
- if TYPE_CHECKING:
15
+ if TYPE_CHECKING: # Using 'TYPE_CHECKING' to avoid circular imports during type checking
15
16
  from hafnia.dataset.hafnia_dataset import HafniaDataset
16
17
 
17
18
 
19
+ DEFAULT_DATASET_NAME = "ImageClassificationDataset"
20
+
21
+
18
22
def from_image_classification_folder(
    path_folder: Path,
    n_samples: Optional[int] = None,
    dataset_name: str = DEFAULT_DATASET_NAME,
) -> "HafniaDataset":
    """Create a HafniaDataset from a classification folder with one sub-folder per split.

    Args:
        path_folder: Root folder whose sub-folders are dataset splits.
        n_samples: Optional cap on the total number of samples.
        dataset_name: Name assigned to the resulting dataset.
    """
    split_paths = get_splits_from_folder(path_folder)
    return from_image_classification_folder_by_split_paths(
        n_samples=n_samples,
        dataset_name=dataset_name,
        list_split_paths=split_paths,
    )
35
+
36
+
37
def from_image_classification_folder_by_split_paths(
    list_split_paths: "List[SplitNameAndPath]",
    dataset_name: str = DEFAULT_DATASET_NAME,
    n_samples: Optional[int] = None,
) -> "HafniaDataset":
    """Build a HafniaDataset from explicit (split name, folder) pairs.

    Class names are collected as the union across all splits so every split
    shares one consistent class list/ordering.

    Args:
        list_split_paths: One entry per split, pointing at its class folders.
        dataset_name: Name assigned to the resulting dataset.
        n_samples: Optional cap on the total sample count, divided evenly
            across splits.

    Raises:
        ValueError: If 'list_split_paths' is empty.
    """
    from hafnia.dataset.hafnia_dataset import HafniaDataset

    # Guard: an empty split list would raise a confusing ZeroDivisionError
    # below and produce a meaningless empty merge.
    if not list_split_paths:
        raise ValueError("'list_split_paths' is empty - at least one split folder is required.")

    # Deduplicate with set(): splits typically share the same classes, and
    # without dedup 'class_names' would contain one entry per split per class,
    # corrupting class indices downstream.
    class_names = sorted(set(more_itertools.collapse([class_names_from_folder(split.path) for split in list_split_paths])))

    if n_samples is not None:
        n_samples = n_samples // len(list_split_paths)  # Divide samples evenly across splits
    datasets_per_split = []
    for split in list_split_paths:
        dataset_split = from_image_classification_split_folder(
            path_folder=split.path,
            split=split.name,
            n_samples=n_samples,
            class_names=class_names,
            dataset_name=dataset_name,
        )

        datasets_per_split.append(dataset_split)

    dataset = HafniaDataset.from_merger(datasets=datasets_per_split)
    dataset.info.dataset_name = dataset_name
    return dataset
63
+
64
+
65
+ def from_image_classification_split_folder(
19
66
  path_folder: Path,
20
67
  split: str,
68
+ dataset_name: str,
21
69
  n_samples: Optional[int] = None,
70
+ class_names: Optional[List[str]] = None,
22
71
  ) -> "HafniaDataset":
23
- from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
72
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
24
73
 
25
- class_folder_paths = [path for path in path_folder.iterdir() if path.is_dir()]
26
- class_names = sorted([folder.name for folder in class_folder_paths]) # Sort for determinism
74
+ if class_names is None:
75
+ class_names = class_names_from_folder(path_folder)
27
76
 
28
77
  # Gather all image paths per class
29
78
  path_images_per_class: List[List[Path]] = []
30
- for path_class_folder in class_folder_paths:
79
+ for class_name in class_names:
80
+ path_class_folder = path_folder / class_name
31
81
  per_class_images = []
32
82
  for path_image in list(path_class_folder.rglob("*.*")):
33
83
  if is_image_file(path_image):
@@ -41,7 +91,9 @@ def from_image_classification_folder(
41
91
  path_images = path_images[:n_samples]
42
92
 
43
93
  samples = []
44
- for path_image_org in track(path_images, description="Convert 'image classification' dataset to Hafnia Dataset"):
94
+ for path_image_org in progress_bar(
95
+ path_images, description="Convert 'image classification' dataset to Hafnia Dataset"
96
+ ):
45
97
  class_name = path_image_org.parent.name
46
98
 
47
99
  read_image = Image.open(path_image_org)
@@ -58,7 +110,7 @@ def from_image_classification_folder(
58
110
  samples.append(sample)
59
111
 
60
112
  dataset_info = DatasetInfo(
61
- dataset_name="ImageClassificationFromDirectoryTree",
113
+ dataset_name=dataset_name,
62
114
  tasks=[TaskInfo(primitive=Classification, class_names=class_names)],
63
115
  )
64
116
 
@@ -71,6 +123,29 @@ def to_image_classification_folder(
71
123
  path_output: Path,
72
124
  task_name: Optional[str] = None,
73
125
  clean_folder: bool = False,
126
+ ) -> List[Path]:
127
+ task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
128
+
129
+ split_names = dataset.samples[SampleField.SPLIT].unique().to_list()
130
+ split_paths = []
131
+ for split_name in split_names:
132
+ dataset_split = dataset.create_split_dataset(split_name)
133
+ split_path = to_image_classification_split_folder(
134
+ dataset=dataset_split,
135
+ path_output_split=path_output / split_name,
136
+ task_name=task.name,
137
+ clean_folder=clean_folder,
138
+ )
139
+ split_paths.append(split_path)
140
+
141
+ return split_paths
142
+
143
+
144
+ def to_image_classification_split_folder(
145
+ dataset: "HafniaDataset",
146
+ path_output_split: Path,
147
+ task_name: Optional[str] = None,
148
+ clean_folder: bool = False,
74
149
  ) -> Path:
75
150
  task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
76
151
 
@@ -90,21 +165,27 @@ def to_image_classification_folder(
90
165
  raise ValueError(f"Some samples have multiple classifications for task '{task.name}'.")
91
166
 
92
167
  if clean_folder:
93
- shutil.rmtree(path_output, ignore_errors=True)
94
- path_output.mkdir(parents=True, exist_ok=True)
168
+ shutil.rmtree(path_output_split, ignore_errors=True)
169
+ path_output_split.mkdir(parents=True, exist_ok=True)
95
170
 
96
171
  description = "Export Hafnia Dataset to directory tree"
97
- for sample_dict in track(samples.iter_rows(named=True), total=len(samples), description=description):
172
+ for sample_dict in progress_bar(samples.iter_rows(named=True), total=len(samples), description=description):
98
173
  classifications = sample_dict[task.primitive.column_name()]
99
174
  if len(classifications) != 1:
100
175
  raise ValueError("Each sample should have exactly one classification.")
101
176
  classification = classifications[0]
102
177
  class_name = classification[PrimitiveField.CLASS_NAME].replace("/", "_") # Avoid issues with subfolders
103
- path_class_folder = path_output / class_name
178
+ path_class_folder = path_output_split / class_name
104
179
  path_class_folder.mkdir(parents=True, exist_ok=True)
105
180
 
106
181
  path_image_org = Path(sample_dict[SampleField.FILE_PATH])
107
182
  path_image_new = path_class_folder / path_image_org.name
108
183
  shutil.copy2(path_image_org, path_image_new)
109
184
 
110
- return path_output
185
+ return path_output_split
186
+
187
+
188
def class_names_from_folder(path_folder: Path) -> List[str]:
    """Return the sorted names of all immediate sub-folders of 'path_folder' (one folder per class)."""
    sub_folder_names = (entry.name for entry in path_folder.iterdir() if entry.is_dir())
    return sorted(sub_folder_names)  # Sorted for deterministic class ordering