hafnia 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py} +148 -238
  2. hafnia/dataset/dataset_helpers.py +1 -15
  3. hafnia/dataset/dataset_names.py +43 -3
  4. hafnia/dataset/format_conversions/format_coco.py +490 -0
  5. hafnia/dataset/format_conversions/format_helpers.py +33 -0
  6. hafnia/dataset/format_conversions/format_image_classification_folder.py +95 -14
  7. hafnia/dataset/format_conversions/format_yolo.py +115 -25
  8. hafnia/dataset/format_conversions/torchvision_datasets.py +16 -11
  9. hafnia/dataset/hafnia_dataset.py +119 -490
  10. hafnia/dataset/hafnia_dataset_types.py +479 -0
  11. hafnia/dataset/license_types.py +4 -4
  12. hafnia/dataset/operations/dataset_s3_storage.py +211 -0
  13. hafnia/dataset/operations/dataset_stats.py +3 -3
  14. hafnia/dataset/operations/dataset_transformations.py +14 -17
  15. hafnia/dataset/operations/table_transformations.py +22 -14
  16. hafnia/dataset/primitives/bbox.py +6 -2
  17. hafnia/dataset/primitives/bitmask.py +21 -46
  18. hafnia/dataset/primitives/classification.py +1 -1
  19. hafnia/dataset/primitives/polygon.py +43 -2
  20. hafnia/dataset/primitives/primitive.py +1 -1
  21. hafnia/dataset/primitives/segmentation.py +1 -1
  22. hafnia/experiment/hafnia_logger.py +13 -4
  23. hafnia/http.py +2 -1
  24. hafnia/platform/datasets.py +195 -105
  25. hafnia/platform/s5cmd_utils.py +147 -0
  26. hafnia/torch_helpers.py +48 -4
  27. hafnia/utils.py +38 -0
  28. hafnia/visualizations/image_visualizations.py +3 -1
  29. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/METADATA +4 -4
  30. hafnia-0.5.0.dist-info/RECORD +62 -0
  31. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/WHEEL +1 -1
  32. hafnia_cli/dataset_cmds.py +18 -0
  33. hafnia_cli/profile_cmds.py +0 -1
  34. hafnia-0.4.2.dist-info/RECORD +0 -57
  35. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/entry_points.txt +0 -0
  36. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,22 +4,21 @@ import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union
 
 import boto3
 import polars as pl
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, field_validator
 
-from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
     DatasetVariant,
-    DeploymentStage,
     PrimitiveField,
     SampleField,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.hafnia_dataset_types import Attribution, Sample, TaskInfo
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import (
     Bbox,
@@ -29,26 +28,21 @@ from hafnia.dataset.primitives import (
     Segmentation,
 )
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.http import post
-from hafnia.log import user_logger
-from hafnia.platform.datasets import get_dataset_id
+from hafnia.platform.datasets import upload_dataset_details
+from hafnia.utils import get_path_dataset_gallery_images
 from hafnia_cli.config import Config
 
 
-def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
-    # TODO: When moving to versioning we do NOT need 'staging' and 'production' specific buckets
-    # and the new name convention should be: f"hafnia-dataset-{dataset_name}"
-    return f"mdi-{deployment_stage.value}-{dataset_name}"
-
-
-class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+class DatasetDetails(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
+    title: Optional[str] = None
+    overview: Optional[str] = None
     data_captured_start: Optional[datetime] = None
     data_captured_end: Optional[datetime] = None
     data_received_start: Optional[datetime] = None
     data_received_end: Optional[datetime] = None
-    latest_update: Optional[datetime] = None
+    dataset_updated_at: Optional[datetime] = None
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
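Note: DbDataset is renamed to DatasetDetails and gains optional title and overview fields. A minimal construction sketch (values are placeholders; fields not shown keep their defaults):

    from hafnia.dataset.dataset_details_uploader import DatasetDetails

    details = DatasetDetails(
        name="my-dataset",
        title="My Dataset",                       # new optional field in 0.5.0
        overview="Short description of the dataset.",  # new optional field in 0.5.0
        version="1.0.0",
    )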
@@ -150,14 +144,6 @@ class DbAnnotationType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     name: str
 
 
-class AnnotationType(Enum):
-    ImageClassification = "Image Classification"
-    ObjectDetection = "Object Detection"
-    SegmentationMask = "Segmentation Mask"
-    ImageCaptioning = "Image Captioning"
-    InstanceSegmentation = "Instance Segmentation"
-
-
 class DbResolution(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     height: int
     width: int
@@ -289,26 +275,32 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
 
 
-def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
-    if not upload_gallery_images:
-        dataset_update.imgs = None
-
-    cfg = Config()
-    dataset_details = dataset_update.model_dump_json()
-    data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
-    return data
-
-
-def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
-    dataset_endpoint = cfg.get_platform_endpoint("datasets")
-    dataset_id = get_dataset_id(dataset_name, dataset_endpoint, cfg.api_key)
+def upload_dataset_details_to_platform(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
+    distribution_task_names: Optional[List[str]] = None,
+    update_platform: bool = True,
+    cfg: Optional[Config] = None,
+) -> dict:
+    cfg = cfg or Config()
+    dataset_details = dataset_details_from_hafnia_dataset(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+        distribution_task_names=distribution_task_names,
+    )
 
-    import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
-    headers = {"Authorization": cfg.api_key}
+    if update_platform:
+        dataset_details_exclude_none = dataset_details.model_dump(exclude_none=True, mode="json")
+        upload_dataset_details(
+            cfg=cfg,
+            data=dataset_details_exclude_none,
+            dataset_name=dataset_details.name,
+        )
 
-    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
-    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
-    return response  # type: ignore[return-value]
+    dataset_details_dict = dataset_details.model_dump(exclude_none=False, mode="json")
+    return dataset_details_dict
 
 
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
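Note: a minimal sketch of calling the new entry point, assuming `dataset` is a HafniaDataset loaded elsewhere; with update_platform=False the details payload is built and returned without being pushed to the platform:

    from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform

    # `dataset` is assumed to be an already-loaded HafniaDataset instance
    details_dict = upload_dataset_details_to_platform(
        dataset=dataset,
        update_platform=False,  # build and return the payload without uploading
    )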
@@ -322,18 +314,6 @@ def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
     return resolutions
 
 
-def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Type[Primitive]) -> bool:
-    col_name = PrimitiveType.column_name()
-    table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
-    if col_name not in table.columns:
-        return False
-
-    if table[col_name].dtype == pl.Null:
-        return False
-
-    return True
-
-
 def calculate_distribution_values(
     dataset_split: pl.DataFrame, distribution_tasks: Optional[List[TaskInfo]]
 ) -> List[DbDistributionValue]:
@@ -378,46 +358,34 @@ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: bot
     return last_modified, size
 
 
-def dataset_info_from_dataset(
+def dataset_details_from_hafnia_dataset(
     dataset: HafniaDataset,
-    deployment_stage: DeploymentStage,
-    path_sample: Optional[Path],
-    path_hidden: Optional[Path],
     path_gallery_images: Optional[Path] = None,
     gallery_image_names: Optional[List[str]] = None,
-    distribution_task_names: Optional[List[TaskInfo]] = None,
-) -> DbDataset:
+    distribution_task_names: Optional[List[str]] = None,
+) -> DatasetDetails:
     dataset_variants = []
     dataset_reports = []
     dataset_meta_info = dataset.info.meta or {}
 
-    path_and_variant: List[Tuple[Path, DatasetVariant]] = []
-    if path_sample is not None:
-        path_and_variant.append((path_sample, DatasetVariant.SAMPLE))
-
-    if path_hidden is not None:
-        path_and_variant.append((path_hidden, DatasetVariant.HIDDEN))
-
-    if len(path_and_variant) == 0:
-        raise ValueError("At least one path must be provided for sample or hidden dataset.")
-
+    path_and_variant = [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]
     gallery_images = create_gallery_images(
        dataset=dataset,
        path_gallery_images=path_gallery_images,
        gallery_image_names=gallery_image_names,
    )
 
-    for path_dataset, variant_type in path_and_variant:
+    for variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
         else:
             dataset_variant = dataset
 
-        size_bytes = get_folder_size(path_dataset)
+        files_paths = dataset_variant.samples[SampleField.FILE_PATH].to_list()
+        size_bytes = sum([Path(file_path).stat().st_size for file_path in files_paths])
         dataset_variants.append(
             DbDatasetVariant(
                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
-                # upload_date: Optional[datetime] = None
                 size_bytes=size_bytes,
                 data_type=DataTypeChoices.images,
                 number_of_data_items=len(dataset_variant),
@@ -425,7 +393,6 @@ def dataset_info_from_dataset(
                 duration=dataset_meta_info.get("duration", None),
                 duration_average=dataset_meta_info.get("duration_average", None),
                 frame_rate=dataset_meta_info.get("frame_rate", None),
-                # bit_rate: Optional[float] = None
                 n_cameras=dataset_meta_info.get("n_cameras", None),
             )
         )
@@ -448,165 +415,8 @@ def dataset_info_from_dataset(
         )
 
         object_reports: List[DbAnnotatedObjectReport] = []
-        primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
-        if has_primitive(dataset_split, PrimitiveType=Bbox):
-            df_per_instance = table_transformations.create_primitive_table(
-                dataset_split, PrimitiveType=Bbox, keep_sample_data=True
-            )
-            if df_per_instance is None:
-                raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
-            # Calculate area of bounding boxes
-            df_per_instance = df_per_instance.with_columns(
-                (pl.col("height") * pl.col("width")).alias("area"),
-            ).with_columns(
-                (pl.col("height") * pl.col("image.height")).alias("height_px"),
-                (pl.col("width") * pl.col("image.width")).alias("width_px"),
-                (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
-            )
-
-            annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-            for (class_name, task_name), class_group in df_per_instance.group_by(
-                PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-            ):
-                if class_name is None:
-                    continue
-                object_reports.append(
-                    DbAnnotatedObjectReport(
-                        obj=DbAnnotatedObject(
-                            name=class_name,
-                            entity_type=EntityTypeChoices.OBJECT.value,
-                            annotation_type=annotation_type,
-                            task_name=task_name,
-                        ),
-                        unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                        obj_instances=len(class_group),
-                        annotation_type=[annotation_type],
-                        images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        area_avg_ratio=class_group["area"].mean(),
-                        area_min_ratio=class_group["area"].min(),
-                        area_max_ratio=class_group["area"].max(),
-                        height_avg_ratio=class_group["height"].mean(),
-                        height_min_ratio=class_group["height"].min(),
-                        height_max_ratio=class_group["height"].max(),
-                        width_avg_ratio=class_group["width"].mean(),
-                        width_min_ratio=class_group["width"].min(),
-                        width_max_ratio=class_group["width"].max(),
-                        area_avg_px=class_group["area_px"].mean(),
-                        area_min_px=int(class_group["area_px"].min()),
-                        area_max_px=int(class_group["area_px"].max()),
-                        height_avg_px=class_group["height_px"].mean(),
-                        height_min_px=int(class_group["height_px"].min()),
-                        height_max_px=int(class_group["height_px"].max()),
-                        width_avg_px=class_group["width_px"].mean(),
-                        width_min_px=int(class_group["width_px"].min()),
-                        width_max_px=int(class_group["width_px"].max()),
-                        average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                    )
-                )
-
-        if has_primitive(dataset_split, PrimitiveType=Classification):
-            annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-            col_name = Classification.column_name()
-            classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
-            has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
-            if has_classification_data:
-                classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
-
-                # Include only classification tasks that are defined in the dataset info
-                classification_df = classification_df.filter(
-                    pl.col(PrimitiveField.TASK_NAME).is_in(classification_tasks)
-                )
-
-                for (
-                    task_name,
-                    class_name,
-                ), class_group in classification_df.group_by(PrimitiveField.TASK_NAME, PrimitiveField.CLASS_NAME):
-                    if class_name is None:
-                        continue
-                    if task_name == Classification.default_task_name():
-                        display_name = class_name  # Prefix class name with task name
-                    else:
-                        display_name = f"{task_name}.{class_name}"
-                    object_reports.append(
-                        DbAnnotatedObjectReport(
-                            obj=DbAnnotatedObject(
-                                name=display_name,
-                                entity_type=EntityTypeChoices.EVENT.value,
-                                annotation_type=annotation_type,
-                                task_name=task_name,
-                            ),
-                            unique_obj_ids=len(
-                                class_group
-                            ),  # Unique object IDs are not applicable for classification
-                            obj_instances=len(class_group),
-                            annotation_type=[annotation_type],
-                        )
-                    )
-
-        if has_primitive(dataset_split, PrimitiveType=Segmentation):
-            raise NotImplementedError("Not Implemented yet")
-
-        if has_primitive(dataset_split, PrimitiveType=Bitmask):
-            col_name = Bitmask.column_name()
-            drop_columns = [col for col in primitive_columns if col != col_name]
-            drop_columns.append(PrimitiveField.META)
-
-            df_per_instance = table_transformations.create_primitive_table(
-                dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
-            )
-            if df_per_instance is None:
-                raise ValueError(
-                    f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
-                )
-            df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
-            df_per_instance = df_per_instance.with_columns(
-                (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
-                (pl.col("height_px") / pl.col("image.height")).alias("height"),
-                (pl.col("width_px") / pl.col("image.width")).alias("width"),
-            )
-
-            annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-            for (class_name, task_name), class_group in df_per_instance.group_by(
-                PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-            ):
-                if class_name is None:
-                    continue
-                object_reports.append(
-                    DbAnnotatedObjectReport(
-                        obj=DbAnnotatedObject(
-                            name=class_name,
-                            entity_type=EntityTypeChoices.OBJECT.value,
-                            annotation_type=annotation_type,
-                            task_name=task_name,
-                        ),
-                        unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                        obj_instances=len(class_group),
-                        annotation_type=[annotation_type],
-                        average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        area_avg_ratio=class_group["area"].mean(),
-                        area_min_ratio=class_group["area"].min(),
-                        area_max_ratio=class_group["area"].max(),
-                        height_avg_ratio=class_group["height"].mean(),
-                        height_min_ratio=class_group["height"].min(),
-                        height_max_ratio=class_group["height"].max(),
-                        width_avg_ratio=class_group["width"].mean(),
-                        width_min_ratio=class_group["width"].min(),
-                        width_max_ratio=class_group["width"].max(),
-                        area_avg_px=class_group["area_px"].mean(),
-                        area_min_px=int(class_group["area_px"].min()),
-                        area_max_px=int(class_group["area_px"].max()),
-                        height_avg_px=class_group["height_px"].mean(),
-                        height_min_px=int(class_group["height_px"].min()),
-                        height_max_px=int(class_group["height_px"].max()),
-                        width_avg_px=class_group["width_px"].mean(),
-                        width_min_px=int(class_group["width_px"].min()),
-                        width_max_px=int(class_group["width_px"].max()),
-                    )
-                )
-
-        if has_primitive(dataset_split, PrimitiveType=Polygon):
-            raise NotImplementedError("Not Implemented yet")
+        for PrimitiveType in [Classification, Bbox, Bitmask, Polygon, Segmentation]:
+            object_reports.extend(create_reports_from_primitive(dataset_split, PrimitiveType=PrimitiveType))  # type: ignore[type-abstract]
 
         # Sort object reports by name to more easily compare between versions
         object_reports = sorted(object_reports, key=lambda x: x.obj.name)  # Sort object reports by name
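Note: the removed per-primitive blocks are consolidated into create_reports_from_primitive (added further down in this file); a short sketch of calling it for one primitive type, assuming `dataset_split` is a polars DataFrame for a single split:

    from hafnia.dataset.dataset_details_uploader import create_reports_from_primitive
    from hafnia.dataset.primitives import Bbox

    # `dataset_split` is assumed to be a polars DataFrame holding one dataset split
    bbox_reports = create_reports_from_primitive(dataset_split, PrimitiveType=Bbox)
    # Returns an empty list when the split contains no Bbox annotations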
@@ -617,14 +427,14 @@ def dataset_info_from_dataset(
 
         dataset_reports.append(report)
     dataset_name = dataset.info.dataset_name
-    bucket_sample = generate_bucket_name(dataset_name, deployment_stage=deployment_stage)
-    dataset_info = DbDataset(
+    dataset_info = DatasetDetails(
         name=dataset_name,
+        title=dataset.info.dataset_title,
+        overview=dataset.info.description,
         version=dataset.info.version,
-        s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-        latest_update=dataset.info.updated_at,
+        dataset_updated_at=dataset.info.updated_at,
         dataset_format_version=dataset.info.format_version,
         license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
@@ -639,6 +449,101 @@ def dataset_info_from_dataset(
     return dataset_info
 
 
+def create_reports_from_primitive(
+    dataset_split: pl.DataFrame, PrimitiveType: Type[Primitive]
+) -> List[DbAnnotatedObjectReport]:
+    if not table_transformations.has_primitive(dataset_split, PrimitiveType=PrimitiveType):
+        return []
+
+    if PrimitiveType == Segmentation:
+        raise NotImplementedError("Not Implemented yet")
+
+    df_per_instance = table_transformations.create_primitive_table(
+        dataset_split, PrimitiveType=PrimitiveType, keep_sample_data=True
+    )
+    if df_per_instance is None:
+        raise ValueError(f"Expected {PrimitiveType.__name__} primitive column to be present in the dataset split.")
+
+    entity_type = EntityTypeChoices.OBJECT.value
+    if PrimitiveType == Classification:
+        entity_type = EntityTypeChoices.EVENT.value
+
+    if PrimitiveType == Bbox:
+        df_per_instance = df_per_instance.with_columns(area=pl.col("height") * pl.col("width"))
+
+    if PrimitiveType == Bitmask:
+        # width and height are in pixel format for Bitmask convert to ratio
+        df_per_instance = df_per_instance.with_columns(
+            width=pl.col("width") / pl.col("image.width"),
+            height=pl.col("height") / pl.col("image.height"),
+        )
+
+    has_height_field = "height" in df_per_instance.columns and df_per_instance["height"].dtype != pl.Null
+    if has_height_field:
+        df_per_instance = df_per_instance.with_columns(
+            height_px=pl.col("height") * pl.col("image.height"),
+        )
+
+    has_width_field = "width" in df_per_instance.columns and df_per_instance["width"].dtype != pl.Null
+    if has_width_field:
+        df_per_instance = df_per_instance.with_columns(
+            width_px=pl.col("width") * pl.col("image.width"),
+        )
+
+    has_area_field = "area" in df_per_instance.columns and df_per_instance["area"].dtype != pl.Null
+    if has_area_field:
+        df_per_instance = df_per_instance.with_columns(
+            area_px=pl.col("image.height") * pl.col("image.width") * pl.col("area")
+        )
+    object_reports: List[DbAnnotatedObjectReport] = []
+    annotation_type = DbAnnotationType(name=PrimitiveType.__name__)
+    for (class_name, task_name), class_group in df_per_instance.group_by(
+        PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
+    ):
+        if class_name is None:
+            continue
+
+        object_report = DbAnnotatedObjectReport(
+            obj=DbAnnotatedObject(
+                name=class_name,
+                entity_type=entity_type,
+                annotation_type=annotation_type,
+                task_name=task_name,
+            ),
+            unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
+            obj_instances=len(class_group),
+            annotation_type=[annotation_type],
+            average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
+            images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
+        )
+        if has_height_field:
+            object_report.height_avg_ratio = class_group["height"].mean()
+            object_report.height_min_ratio = class_group["height"].min()
+            object_report.height_max_ratio = class_group["height"].max()
+            object_report.height_avg_px = class_group["height_px"].mean()
+            object_report.height_min_px = int(class_group["height_px"].min())
+            object_report.height_max_px = int(class_group["height_px"].max())
+
+        if has_width_field:
+            object_report.width_avg_ratio = class_group["width"].mean()
+            object_report.width_min_ratio = class_group["width"].min()
+            object_report.width_max_ratio = class_group["width"].max()
+            object_report.width_avg_px = class_group["width_px"].mean()
+            object_report.width_min_px = int(class_group["width_px"].min())
+            object_report.width_max_px = int(class_group["width_px"].max())
+
+        if has_area_field:
+            object_report.area_avg_ratio = class_group["area"].mean()
+            object_report.area_min_ratio = class_group["area"].min()
+            object_report.area_max_ratio = class_group["area"].max()
+            object_report.area_avg_px = class_group["area_px"].mean()
+            object_report.area_min_px = int(class_group["area_px"].min())
+            object_report.area_max_px = int(class_group["area_px"].max())
+
+        object_reports.append(object_report)
+    return object_reports
+
+
 def create_gallery_images(
     dataset: HafniaDataset,
     path_gallery_images: Optional[Path],
@@ -647,7 +552,7 @@ def create_gallery_images(
     gallery_images = None
     if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
         if path_gallery_images is None:
-            raise ValueError("Path to gallery images must be provided.")
+            path_gallery_images = get_path_dataset_gallery_images(dataset.info.dataset_name)
         path_gallery_images.mkdir(parents=True, exist_ok=True)
         COL_IMAGE_NAME = "image_name"
         samples = dataset.samples.with_columns(
@@ -657,7 +562,12 @@ def create_gallery_images(
 
         missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
         if len(missing_gallery_samples):
-            raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
+            potential_samples = samples[COL_IMAGE_NAME].sort().to_list()
+            formatted_samples = ", ".join([f'"{s}"' for s in potential_samples[:9]])
+            raise ValueError(
+                f"Gallery images not found in dataset: {missing_gallery_samples}. "
+                f"Consider adding this to dataset definition: \ngallery_image_names=[{formatted_samples}]"
+            )
         gallery_images = []
         for gallery_sample in gallery_samples.iter_rows(named=True):
             sample = Sample(**gallery_sample)
@@ -57,20 +57,6 @@ def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_s
 def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
     """
     Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
-
-    E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
-    'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
-    Notice that the hash is used for both the filename and the subfolder name.
-
-    Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
-    unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
-
-    The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
-    download 3500 files per second from a single folder (prefix) in S3.
-
-    For even a single user, we found that this limit was being reached when files are stored in single folder (prefix)
-    in S3. To support multiple users and concurrent experiments, we are required to separate files into
-    multiple sub-folders (prefixes) in S3 to not hit the rate limit.
     """
 
     if not path_source.exists():
@@ -86,7 +72,7 @@ def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
 
 
 def relative_path_from_hash(hash: str, suffix: str) -> Path:
-    path_file = Path("data") / hash[:3] / f"{hash}{suffix}"
+    path_file = Path("data") / f"{hash}{suffix}"
     return path_file
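Note: files are no longer sharded into hash-prefixed sub-folders, as the removed docstring above described. A quick before/after for the example hash from that docstring, assuming these helpers live in hafnia/dataset/dataset_helpers.py (the file listed with a matching +1/-15 change):

    from hafnia.dataset.dataset_helpers import relative_path_from_hash

    relative_path_from_hash("dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", suffix=".png")
    # 0.4.2: data/dfe/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png
    # 0.5.0: data/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png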
 
 
@@ -2,6 +2,7 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 import boto3
+from botocore.exceptions import UnauthorizedSSOTokenError
 from pydantic import BaseModel, field_validator
 
 FILENAME_RECIPE_JSON = "recipe.json"
@@ -21,6 +22,7 @@ class DeploymentStage(Enum):
     PRODUCTION = "production"
 
 
+ARN_PREFIX = "arn:aws:s3:::"
 TAG_IS_SAMPLE = "sample"
 
 OPS_REMOVE_CLASS = "__REMOVE__"
@@ -93,6 +95,32 @@ class SplitName:
     def all_split_names() -> List[str]:
         return [*SplitName.valid_splits(), SplitName.UNDEFINED]
 
+    @staticmethod
+    def map_split_name(potential_split_name: str, strict: bool = True) -> str:
+        normalized = potential_split_name.strip().lower()
+
+        if normalized in SPLIT_NAME_MAPPINGS:
+            return SPLIT_NAME_MAPPINGS[normalized]
+
+        if strict:
+            raise ValueError(f"Unrecognized split name: {potential_split_name}")
+        else:
+            return SplitName.UNDEFINED
+
+
+SPLIT_NAME_MAPPINGS = {
+    # Train variations
+    "train": SplitName.TRAIN,
+    "training": SplitName.TRAIN,
+    # Validation variations
+    "validation": SplitName.VAL,
+    "val": SplitName.VAL,
+    "valid": SplitName.VAL,
+    # Test variations
+    "test": SplitName.TEST,
+    "testing": SplitName.TEST,
+}
+
 
 class DatasetVariant(Enum):
     DUMP = "dump"
@@ -125,7 +153,14 @@ class AwsCredentials(BaseModel):
         """
         Creates AwsCredentials from a Boto3 session.
         """
-        frozen_credentials = session.get_credentials().get_frozen_credentials()
+        try:
+            frozen_credentials = session.get_credentials().get_frozen_credentials()
+        except UnauthorizedSSOTokenError as e:
+            raise RuntimeError(
+                f"Failed to get AWS credentials from the session for profile '{session.profile_name}'.\n"
+                f"Ensure the profile exists in your AWS config in '~/.aws/config' and that you are logged in via AWS SSO.\n"
+                f"\tUse 'aws sso login --profile {session.profile_name}' to log in."
+            ) from e
         return AwsCredentials(
             access_key=frozen_credentials.access_key,
             secret_key=frozen_credentials.secret_key,
@@ -133,8 +168,13 @@ class AwsCredentials(BaseModel):
             region=session.region_name,
         )
 
-
-ARN_PREFIX = "arn:aws:s3:::"
+    def to_resource_credentials(self, bucket_name: str) -> "ResourceCredentials":
+        """
+        Converts AwsCredentials to ResourceCredentials by adding the S3 ARN.
+        """
+        payload = self.model_dump()
+        payload["s3_arn"] = f"{ARN_PREFIX}{bucket_name}"
+        return ResourceCredentials(**payload)
 
 
 class ResourceCredentials(AwsCredentials):
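Note: a minimal sketch of the new to_resource_credentials helper, assuming `creds` is an AwsCredentials instance created elsewhere (e.g. from a boto3 session); the bucket name is a placeholder:

    # `creds` is assumed to be an AwsCredentials instance
    resource_creds = creds.to_resource_credentials(bucket_name="my-dataset-bucket")
    print(resource_creds.s3_arn)  # arn:aws:s3:::my-dataset-bucket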