hafnia 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py} +114 -191
  2. hafnia/dataset/dataset_names.py +26 -0
  3. hafnia/dataset/format_conversions/format_coco.py +490 -0
  4. hafnia/dataset/format_conversions/format_helpers.py +33 -0
  5. hafnia/dataset/format_conversions/format_image_classification_folder.py +95 -14
  6. hafnia/dataset/format_conversions/format_yolo.py +115 -25
  7. hafnia/dataset/format_conversions/torchvision_datasets.py +10 -8
  8. hafnia/dataset/hafnia_dataset.py +20 -466
  9. hafnia/dataset/hafnia_dataset_types.py +477 -0
  10. hafnia/dataset/license_types.py +4 -4
  11. hafnia/dataset/operations/dataset_stats.py +3 -3
  12. hafnia/dataset/operations/dataset_transformations.py +14 -17
  13. hafnia/dataset/operations/table_transformations.py +20 -13
  14. hafnia/dataset/primitives/bbox.py +6 -2
  15. hafnia/dataset/primitives/bitmask.py +21 -46
  16. hafnia/dataset/primitives/classification.py +1 -1
  17. hafnia/dataset/primitives/polygon.py +43 -2
  18. hafnia/dataset/primitives/primitive.py +1 -1
  19. hafnia/dataset/primitives/segmentation.py +1 -1
  20. hafnia/experiment/hafnia_logger.py +13 -4
  21. hafnia/platform/datasets.py +2 -3
  22. hafnia/torch_helpers.py +48 -4
  23. hafnia/utils.py +34 -0
  24. hafnia/visualizations/image_visualizations.py +3 -1
  25. {hafnia-0.4.2.dist-info → hafnia-0.4.3.dist-info}/METADATA +2 -2
  26. {hafnia-0.4.2.dist-info → hafnia-0.4.3.dist-info}/RECORD +29 -26
  27. {hafnia-0.4.2.dist-info → hafnia-0.4.3.dist-info}/WHEEL +0 -0
  28. {hafnia-0.4.2.dist-info → hafnia-0.4.3.dist-info}/entry_points.txt +0 -0
  29. {hafnia-0.4.2.dist-info → hafnia-0.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -11,7 +11,6 @@ import polars as pl
11
11
  from PIL import Image
12
12
  from pydantic import BaseModel, ConfigDict, field_validator
13
13
 
14
- from hafnia.dataset import primitives
15
14
  from hafnia.dataset.dataset_names import (
16
15
  DatasetVariant,
17
16
  DeploymentStage,
@@ -19,7 +18,8 @@ from hafnia.dataset.dataset_names import (
19
18
  SampleField,
20
19
  SplitName,
21
20
  )
22
- from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
21
+ from hafnia.dataset.hafnia_dataset import HafniaDataset
22
+ from hafnia.dataset.hafnia_dataset_types import Attribution, Sample, TaskInfo
23
23
  from hafnia.dataset.operations import table_transformations
24
24
  from hafnia.dataset.primitives import (
25
25
  Bbox,
@@ -41,7 +41,7 @@ def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -
41
41
  return f"mdi-{deployment_stage.value}-{dataset_name}"
42
42
 
43
43
 
44
- class DbDataset(BaseModel, validate_assignment=True): # type: ignore[call-arg]
44
+ class DatasetDetails(BaseModel, validate_assignment=True): # type: ignore[call-arg]
45
45
  model_config = ConfigDict(use_enum_values=True) # To parse Enum values as strings
46
46
  name: str
47
47
  data_captured_start: Optional[datetime] = None
@@ -150,14 +150,6 @@ class DbAnnotationType(BaseModel, validate_assignment=True): # type: ignore[cal
150
150
  name: str
151
151
 
152
152
 
153
- class AnnotationType(Enum):
154
- ImageClassification = "Image Classification"
155
- ObjectDetection = "Object Detection"
156
- SegmentationMask = "Segmentation Mask"
157
- ImageCaptioning = "Image Captioning"
158
- InstanceSegmentation = "Instance Segmentation"
159
-
160
-
161
153
  class DbResolution(BaseModel, validate_assignment=True): # type: ignore[call-arg]
162
154
  height: int
163
155
  width: int
@@ -289,7 +281,7 @@ def get_folder_size(path: Path) -> int:
289
281
  return sum([path.stat().st_size for path in path.rglob("*")])
290
282
 
291
283
 
292
- def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
284
+ def upload_to_hafnia_dataset_detail_page(dataset_update: DatasetDetails, upload_gallery_images: bool) -> dict:
293
285
  if not upload_gallery_images:
294
286
  dataset_update.imgs = None
295
287
 
@@ -322,18 +314,6 @@ def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -
322
314
  return resolutions
323
315
 
324
316
 
325
- def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Type[Primitive]) -> bool:
326
- col_name = PrimitiveType.column_name()
327
- table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
328
- if col_name not in table.columns:
329
- return False
330
-
331
- if table[col_name].dtype == pl.Null:
332
- return False
333
-
334
- return True
335
-
336
-
337
317
  def calculate_distribution_values(
338
318
  dataset_split: pl.DataFrame, distribution_tasks: Optional[List[TaskInfo]]
339
319
  ) -> List[DbDistributionValue]:
@@ -378,15 +358,15 @@ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: bot
378
358
  return last_modified, size
379
359
 
380
360
 
381
- def dataset_info_from_dataset(
361
+ def dataset_details_from_hafnia_dataset(
382
362
  dataset: HafniaDataset,
383
363
  deployment_stage: DeploymentStage,
384
364
  path_sample: Optional[Path],
385
365
  path_hidden: Optional[Path],
386
366
  path_gallery_images: Optional[Path] = None,
387
367
  gallery_image_names: Optional[List[str]] = None,
388
- distribution_task_names: Optional[List[TaskInfo]] = None,
389
- ) -> DbDataset:
368
+ distribution_task_names: Optional[List[str]] = None,
369
+ ) -> DatasetDetails:
390
370
  dataset_variants = []
391
371
  dataset_reports = []
392
372
  dataset_meta_info = dataset.info.meta or {}
@@ -448,177 +428,20 @@ def dataset_info_from_dataset(
448
428
  )
449
429
 
450
430
  object_reports: List[DbAnnotatedObjectReport] = []
451
- primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
452
- if has_primitive(dataset_split, PrimitiveType=Bbox):
453
- df_per_instance = table_transformations.create_primitive_table(
454
- dataset_split, PrimitiveType=Bbox, keep_sample_data=True
455
- )
456
- if df_per_instance is None:
457
- raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
458
- # Calculate area of bounding boxes
459
- df_per_instance = df_per_instance.with_columns(
460
- (pl.col("height") * pl.col("width")).alias("area"),
461
- ).with_columns(
462
- (pl.col("height") * pl.col("image.height")).alias("height_px"),
463
- (pl.col("width") * pl.col("image.width")).alias("width_px"),
464
- (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
465
- )
466
-
467
- annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
468
- for (class_name, task_name), class_group in df_per_instance.group_by(
469
- PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
470
- ):
471
- if class_name is None:
472
- continue
473
- object_reports.append(
474
- DbAnnotatedObjectReport(
475
- obj=DbAnnotatedObject(
476
- name=class_name,
477
- entity_type=EntityTypeChoices.OBJECT.value,
478
- annotation_type=annotation_type,
479
- task_name=task_name,
480
- ),
481
- unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
482
- obj_instances=len(class_group),
483
- annotation_type=[annotation_type],
484
- images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
485
- area_avg_ratio=class_group["area"].mean(),
486
- area_min_ratio=class_group["area"].min(),
487
- area_max_ratio=class_group["area"].max(),
488
- height_avg_ratio=class_group["height"].mean(),
489
- height_min_ratio=class_group["height"].min(),
490
- height_max_ratio=class_group["height"].max(),
491
- width_avg_ratio=class_group["width"].mean(),
492
- width_min_ratio=class_group["width"].min(),
493
- width_max_ratio=class_group["width"].max(),
494
- area_avg_px=class_group["area_px"].mean(),
495
- area_min_px=int(class_group["area_px"].min()),
496
- area_max_px=int(class_group["area_px"].max()),
497
- height_avg_px=class_group["height_px"].mean(),
498
- height_min_px=int(class_group["height_px"].min()),
499
- height_max_px=int(class_group["height_px"].max()),
500
- width_avg_px=class_group["width_px"].mean(),
501
- width_min_px=int(class_group["width_px"].min()),
502
- width_max_px=int(class_group["width_px"].max()),
503
- average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
504
- )
505
- )
506
-
507
- if has_primitive(dataset_split, PrimitiveType=Classification):
508
- annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
509
- col_name = Classification.column_name()
510
- classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
511
- has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
512
- if has_classification_data:
513
- classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
514
-
515
- # Include only classification tasks that are defined in the dataset info
516
- classification_df = classification_df.filter(
517
- pl.col(PrimitiveField.TASK_NAME).is_in(classification_tasks)
518
- )
519
-
520
- for (
521
- task_name,
522
- class_name,
523
- ), class_group in classification_df.group_by(PrimitiveField.TASK_NAME, PrimitiveField.CLASS_NAME):
524
- if class_name is None:
525
- continue
526
- if task_name == Classification.default_task_name():
527
- display_name = class_name # Prefix class name with task name
528
- else:
529
- display_name = f"{task_name}.{class_name}"
530
- object_reports.append(
531
- DbAnnotatedObjectReport(
532
- obj=DbAnnotatedObject(
533
- name=display_name,
534
- entity_type=EntityTypeChoices.EVENT.value,
535
- annotation_type=annotation_type,
536
- task_name=task_name,
537
- ),
538
- unique_obj_ids=len(
539
- class_group
540
- ), # Unique object IDs are not applicable for classification
541
- obj_instances=len(class_group),
542
- annotation_type=[annotation_type],
543
- )
544
- )
545
-
546
- if has_primitive(dataset_split, PrimitiveType=Segmentation):
547
- raise NotImplementedError("Not Implemented yet")
548
-
549
- if has_primitive(dataset_split, PrimitiveType=Bitmask):
550
- col_name = Bitmask.column_name()
551
- drop_columns = [col for col in primitive_columns if col != col_name]
552
- drop_columns.append(PrimitiveField.META)
553
-
554
- df_per_instance = table_transformations.create_primitive_table(
555
- dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
556
- )
557
- if df_per_instance is None:
558
- raise ValueError(
559
- f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
560
- )
561
- df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
562
- df_per_instance = df_per_instance.with_columns(
563
- (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
564
- (pl.col("height_px") / pl.col("image.height")).alias("height"),
565
- (pl.col("width_px") / pl.col("image.width")).alias("width"),
566
- )
567
-
568
- annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
569
- for (class_name, task_name), class_group in df_per_instance.group_by(
570
- PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
571
- ):
572
- if class_name is None:
573
- continue
574
- object_reports.append(
575
- DbAnnotatedObjectReport(
576
- obj=DbAnnotatedObject(
577
- name=class_name,
578
- entity_type=EntityTypeChoices.OBJECT.value,
579
- annotation_type=annotation_type,
580
- task_name=task_name,
581
- ),
582
- unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
583
- obj_instances=len(class_group),
584
- annotation_type=[annotation_type],
585
- average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
586
- images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
587
- area_avg_ratio=class_group["area"].mean(),
588
- area_min_ratio=class_group["area"].min(),
589
- area_max_ratio=class_group["area"].max(),
590
- height_avg_ratio=class_group["height"].mean(),
591
- height_min_ratio=class_group["height"].min(),
592
- height_max_ratio=class_group["height"].max(),
593
- width_avg_ratio=class_group["width"].mean(),
594
- width_min_ratio=class_group["width"].min(),
595
- width_max_ratio=class_group["width"].max(),
596
- area_avg_px=class_group["area_px"].mean(),
597
- area_min_px=int(class_group["area_px"].min()),
598
- area_max_px=int(class_group["area_px"].max()),
599
- height_avg_px=class_group["height_px"].mean(),
600
- height_min_px=int(class_group["height_px"].min()),
601
- height_max_px=int(class_group["height_px"].max()),
602
- width_avg_px=class_group["width_px"].mean(),
603
- width_min_px=int(class_group["width_px"].min()),
604
- width_max_px=int(class_group["width_px"].max()),
605
- )
606
- )
607
-
608
- if has_primitive(dataset_split, PrimitiveType=Polygon):
609
- raise NotImplementedError("Not Implemented yet")
431
+ for PrimitiveType in [Classification, Bbox, Bitmask, Polygon, Segmentation]:
432
+ object_reports.extend(create_reports_from_primitive(dataset_split, PrimitiveType=PrimitiveType)) # type: ignore[type-abstract]
610
433
 
611
434
  # Sort object reports by name to more easily compare between versions
612
435
  object_reports = sorted(object_reports, key=lambda x: x.obj.name) # Sort object reports by name
613
436
  report.annotated_object_reports = object_reports
614
437
 
615
- if report.distribution_values is None:
616
- report.distribution_values = []
438
+ if report.distribution_values is None:
439
+ report.distribution_values = []
617
440
 
618
- dataset_reports.append(report)
441
+ dataset_reports.append(report)
619
442
  dataset_name = dataset.info.dataset_name
620
443
  bucket_sample = generate_bucket_name(dataset_name, deployment_stage=deployment_stage)
621
- dataset_info = DbDataset(
444
+ dataset_info = DatasetDetails(
622
445
  name=dataset_name,
623
446
  version=dataset.info.version,
624
447
  s3_bucket_name=bucket_sample,
@@ -639,6 +462,101 @@ def dataset_info_from_dataset(
639
462
  return dataset_info
640
463
 
641
464
 
465
+ def create_reports_from_primitive(
466
+ dataset_split: pl.DataFrame, PrimitiveType: Type[Primitive]
467
+ ) -> List[DbAnnotatedObjectReport]:
468
+ if not table_transformations.has_primitive(dataset_split, PrimitiveType=PrimitiveType):
469
+ return []
470
+
471
+ if PrimitiveType == Segmentation:
472
+ raise NotImplementedError("Not Implemented yet")
473
+
474
+ df_per_instance = table_transformations.create_primitive_table(
475
+ dataset_split, PrimitiveType=PrimitiveType, keep_sample_data=True
476
+ )
477
+ if df_per_instance is None:
478
+ raise ValueError(f"Expected {PrimitiveType.__name__} primitive column to be present in the dataset split.")
479
+
480
+ entity_type = EntityTypeChoices.OBJECT.value
481
+ if PrimitiveType == Classification:
482
+ entity_type = EntityTypeChoices.EVENT.value
483
+
484
+ if PrimitiveType == Bbox:
485
+ df_per_instance = df_per_instance.with_columns(area=pl.col("height") * pl.col("width"))
486
+
487
+ if PrimitiveType == Bitmask:
488
+ # width and height are in pixel format for Bitmask convert to ratio
489
+ df_per_instance = df_per_instance.with_columns(
490
+ width=pl.col("width") / pl.col("image.width"),
491
+ height=pl.col("height") / pl.col("image.height"),
492
+ )
493
+
494
+ has_height_field = "height" in df_per_instance.columns and df_per_instance["height"].dtype != pl.Null
495
+ if has_height_field:
496
+ df_per_instance = df_per_instance.with_columns(
497
+ height_px=pl.col("height") * pl.col("image.height"),
498
+ )
499
+
500
+ has_width_field = "width" in df_per_instance.columns and df_per_instance["width"].dtype != pl.Null
501
+ if has_width_field:
502
+ df_per_instance = df_per_instance.with_columns(
503
+ width_px=pl.col("width") * pl.col("image.width"),
504
+ )
505
+
506
+ has_area_field = "area" in df_per_instance.columns and df_per_instance["area"].dtype != pl.Null
507
+ if has_area_field:
508
+ df_per_instance = df_per_instance.with_columns(
509
+ area_px=pl.col("image.height") * pl.col("image.width") * pl.col("area")
510
+ )
511
+ object_reports: List[DbAnnotatedObjectReport] = []
512
+ annotation_type = DbAnnotationType(name=PrimitiveType.__name__)
513
+ for (class_name, task_name), class_group in df_per_instance.group_by(
514
+ PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
515
+ ):
516
+ if class_name is None:
517
+ continue
518
+
519
+ object_report = DbAnnotatedObjectReport(
520
+ obj=DbAnnotatedObject(
521
+ name=class_name,
522
+ entity_type=entity_type,
523
+ annotation_type=annotation_type,
524
+ task_name=task_name,
525
+ ),
526
+ unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
527
+ obj_instances=len(class_group),
528
+ annotation_type=[annotation_type],
529
+ average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
530
+ images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
531
+ )
532
+ if has_height_field:
533
+ object_report.height_avg_ratio = class_group["height"].mean()
534
+ object_report.height_min_ratio = class_group["height"].min()
535
+ object_report.height_max_ratio = class_group["height"].max()
536
+ object_report.height_avg_px = class_group["height_px"].mean()
537
+ object_report.height_min_px = int(class_group["height_px"].min())
538
+ object_report.height_max_px = int(class_group["height_px"].max())
539
+
540
+ if has_width_field:
541
+ object_report.width_avg_ratio = class_group["width"].mean()
542
+ object_report.width_min_ratio = class_group["width"].min()
543
+ object_report.width_max_ratio = class_group["width"].max()
544
+ object_report.width_avg_px = class_group["width_px"].mean()
545
+ object_report.width_min_px = int(class_group["width_px"].min())
546
+ object_report.width_max_px = int(class_group["width_px"].max())
547
+
548
+ if has_area_field:
549
+ object_report.area_avg_ratio = class_group["area"].mean()
550
+ object_report.area_min_ratio = class_group["area"].min()
551
+ object_report.area_max_ratio = class_group["area"].max()
552
+ object_report.area_avg_px = class_group["area_px"].mean()
553
+ object_report.area_min_px = int(class_group["area_px"].min())
554
+ object_report.area_max_px = int(class_group["area_px"].max())
555
+
556
+ object_reports.append(object_report)
557
+ return object_reports
558
+
559
+
642
560
  def create_gallery_images(
643
561
  dataset: HafniaDataset,
644
562
  path_gallery_images: Optional[Path],
@@ -657,7 +575,12 @@ def create_gallery_images(
657
575
 
658
576
  missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
659
577
  if len(missing_gallery_samples):
660
- raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
578
+ potential_samples = samples[COL_IMAGE_NAME].sort().to_list()
579
+ formatted_samples = ", ".join([f'"{s}"' for s in potential_samples[:9]])
580
+ raise ValueError(
581
+ f"Gallery images not found in dataset: {missing_gallery_samples}. "
582
+ f"Consider adding this to dataset definition: \ngallery_image_names=[{formatted_samples}]"
583
+ )
661
584
  gallery_images = []
662
585
  for gallery_sample in gallery_samples.iter_rows(named=True):
663
586
  sample = Sample(**gallery_sample)
@@ -93,6 +93,32 @@ class SplitName:
93
93
  def all_split_names() -> List[str]:
94
94
  return [*SplitName.valid_splits(), SplitName.UNDEFINED]
95
95
 
96
+ @staticmethod
97
+ def map_split_name(potential_split_name: str, strict: bool = True) -> str:
98
+ normalized = potential_split_name.strip().lower()
99
+
100
+ if normalized in SPLIT_NAME_MAPPINGS:
101
+ return SPLIT_NAME_MAPPINGS[normalized]
102
+
103
+ if strict:
104
+ raise ValueError(f"Unrecognized split name: {potential_split_name}")
105
+ else:
106
+ return SplitName.UNDEFINED
107
+
108
+
109
+ SPLIT_NAME_MAPPINGS = {
110
+ # Train variations
111
+ "train": SplitName.TRAIN,
112
+ "training": SplitName.TRAIN,
113
+ # Validation variations
114
+ "validation": SplitName.VAL,
115
+ "val": SplitName.VAL,
116
+ "valid": SplitName.VAL,
117
+ # Test variations
118
+ "test": SplitName.TEST,
119
+ "testing": SplitName.TEST,
120
+ }
121
+
96
122
 
97
123
  class DatasetVariant(Enum):
98
124
  DUMP = "dump"