hafnia 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py} +148 -238
  2. hafnia/dataset/dataset_helpers.py +1 -15
  3. hafnia/dataset/dataset_names.py +43 -3
  4. hafnia/dataset/format_conversions/format_coco.py +490 -0
  5. hafnia/dataset/format_conversions/format_helpers.py +33 -0
  6. hafnia/dataset/format_conversions/format_image_classification_folder.py +95 -14
  7. hafnia/dataset/format_conversions/format_yolo.py +115 -25
  8. hafnia/dataset/format_conversions/torchvision_datasets.py +16 -11
  9. hafnia/dataset/hafnia_dataset.py +119 -490
  10. hafnia/dataset/hafnia_dataset_types.py +479 -0
  11. hafnia/dataset/license_types.py +4 -4
  12. hafnia/dataset/operations/dataset_s3_storage.py +211 -0
  13. hafnia/dataset/operations/dataset_stats.py +3 -3
  14. hafnia/dataset/operations/dataset_transformations.py +14 -17
  15. hafnia/dataset/operations/table_transformations.py +22 -14
  16. hafnia/dataset/primitives/bbox.py +6 -2
  17. hafnia/dataset/primitives/bitmask.py +21 -46
  18. hafnia/dataset/primitives/classification.py +1 -1
  19. hafnia/dataset/primitives/polygon.py +43 -2
  20. hafnia/dataset/primitives/primitive.py +1 -1
  21. hafnia/dataset/primitives/segmentation.py +1 -1
  22. hafnia/experiment/hafnia_logger.py +13 -4
  23. hafnia/http.py +2 -1
  24. hafnia/platform/datasets.py +195 -105
  25. hafnia/platform/s5cmd_utils.py +147 -0
  26. hafnia/torch_helpers.py +48 -4
  27. hafnia/utils.py +38 -0
  28. hafnia/visualizations/image_visualizations.py +3 -1
  29. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/METADATA +4 -4
  30. hafnia-0.5.0.dist-info/RECORD +62 -0
  31. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/WHEEL +1 -1
  32. hafnia_cli/dataset_cmds.py +18 -0
  33. hafnia_cli/profile_cmds.py +0 -1
  34. hafnia-0.4.2.dist-info/RECORD +0 -57
  35. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/entry_points.txt +0 -0
  36. {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,22 +4,21 @@ import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union
 
 import boto3
 import polars as pl
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, field_validator
 
-from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
     DatasetVariant,
-    DeploymentStage,
     PrimitiveField,
     SampleField,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.hafnia_dataset_types import Attribution, Sample, TaskInfo
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import (
     Bbox,
@@ -29,26 +28,21 @@ from hafnia.dataset.primitives import (
     Segmentation,
 )
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.http import post
-from hafnia.log import user_logger
-from hafnia.platform.datasets import get_dataset_id
+from hafnia.platform.datasets import upload_dataset_details
+from hafnia.utils import get_path_dataset_gallery_images
 from hafnia_cli.config import Config
 
 
-def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
-    # TODO: When moving to versioning we do NOT need 'staging' and 'production' specific buckets
-    # and the new name convention should be: f"hafnia-dataset-{dataset_name}"
-    return f"mdi-{deployment_stage.value}-{dataset_name}"
-
-
-class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+class DatasetDetails(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
+    title: Optional[str] = None
+    overview: Optional[str] = None
     data_captured_start: Optional[datetime] = None
     data_captured_end: Optional[datetime] = None
     data_received_start: Optional[datetime] = None
     data_received_end: Optional[datetime] = None
-    latest_update: Optional[datetime] = None
+    dataset_updated_at: Optional[datetime] = None
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
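Note: DbDataset is renamed to DatasetDetails and gains optional title and overview fields. A minimal construction sketch (values are placeholders; fields not shown keep their defaults):

    from hafnia.dataset.dataset_details_uploader import DatasetDetails

    details = DatasetDetails(
        name="my-dataset",
        title="My Dataset",                       # new optional field in 0.5.0
        overview="Short description of the dataset.",  # new optional field in 0.5.0
        version="1.0.0",
    )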
@@ -150,14 +144,6 @@ class DbAnnotationType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     name: str
 
 
-class AnnotationType(Enum):
-    ImageClassification = "Image Classification"
-    ObjectDetection = "Object Detection"
-    SegmentationMask = "Segmentation Mask"
-    ImageCaptioning = "Image Captioning"
-    InstanceSegmentation = "Instance Segmentation"
-
-
 class DbResolution(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     height: int
     width: int
@@ -289,26 +275,32 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
 
 
-def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
-    if not upload_gallery_images:
-        dataset_update.imgs = None
-
-    cfg = Config()
-    dataset_details = dataset_update.model_dump_json()
-    data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
-    return data
-
-
-def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
-    dataset_endpoint = cfg.get_platform_endpoint("datasets")
-    dataset_id = get_dataset_id(dataset_name, dataset_endpoint, cfg.api_key)
+def upload_dataset_details_to_platform(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
+    distribution_task_names: Optional[List[str]] = None,
+    update_platform: bool = True,
+    cfg: Optional[Config] = None,
+) -> dict:
+    cfg = cfg or Config()
+    dataset_details = dataset_details_from_hafnia_dataset(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+        distribution_task_names=distribution_task_names,
+    )
 
-    import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
-    headers = {"Authorization": cfg.api_key}
+    if update_platform:
+        dataset_details_exclude_none = dataset_details.model_dump(exclude_none=True, mode="json")
+        upload_dataset_details(
+            cfg=cfg,
+            data=dataset_details_exclude_none,
+            dataset_name=dataset_details.name,
+        )
 
-    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
-    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
-    return response  # type: ignore[return-value]
+    dataset_details_dict = dataset_details.model_dump(exclude_none=False, mode="json")
+    return dataset_details_dict
 
 
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
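Note: a minimal sketch of calling the new entry point, assuming `dataset` is a HafniaDataset loaded elsewhere; with update_platform=False the details payload is built and returned without being pushed to the platform:

    from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform

    # `dataset` is assumed to be an already-loaded HafniaDataset instance
    details_dict = upload_dataset_details_to_platform(
        dataset=dataset,
        update_platform=False,  # build and return the payload without uploading
    )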
@@ -322,18 +314,6 @@ def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
     return resolutions
 
 
-def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Type[Primitive]) -> bool:
-    col_name = PrimitiveType.column_name()
-    table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
-    if col_name not in table.columns:
-        return False
-
-    if table[col_name].dtype == pl.Null:
-        return False
-
-    return True
-
-
 def calculate_distribution_values(
     dataset_split: pl.DataFrame, distribution_tasks: Optional[List[TaskInfo]]
 ) -> List[DbDistributionValue]:
@@ -378,46 +358,34 @@ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: bot
     return last_modified, size
 
 
-def dataset_info_from_dataset(
+def dataset_details_from_hafnia_dataset(
     dataset: HafniaDataset,
-    deployment_stage: DeploymentStage,
-    path_sample: Optional[Path],
-    path_hidden: Optional[Path],
     path_gallery_images: Optional[Path] = None,
     gallery_image_names: Optional[List[str]] = None,
-    distribution_task_names: Optional[List[TaskInfo]] = None,
-) -> DbDataset:
+    distribution_task_names: Optional[List[str]] = None,
+) -> DatasetDetails:
     dataset_variants = []
     dataset_reports = []
     dataset_meta_info = dataset.info.meta or {}
 
-    path_and_variant: List[Tuple[Path, DatasetVariant]] = []
-    if path_sample is not None:
-        path_and_variant.append((path_sample, DatasetVariant.SAMPLE))
-
-    if path_hidden is not None:
-        path_and_variant.append((path_hidden, DatasetVariant.HIDDEN))
-
-    if len(path_and_variant) == 0:
-        raise ValueError("At least one path must be provided for sample or hidden dataset.")
-
+    path_and_variant = [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]
     gallery_images = create_gallery_images(
        dataset=dataset,
        path_gallery_images=path_gallery_images,
        gallery_image_names=gallery_image_names,
    )
 
-    for path_dataset, variant_type in path_and_variant:
+    for variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
         else:
             dataset_variant = dataset
 
-        size_bytes = get_folder_size(path_dataset)
+        files_paths = dataset_variant.samples[SampleField.FILE_PATH].to_list()
+        size_bytes = sum([Path(file_path).stat().st_size for file_path in files_paths])
         dataset_variants.append(
             DbDatasetVariant(
                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
-                # upload_date: Optional[datetime] = None
                 size_bytes=size_bytes,
                 data_type=DataTypeChoices.images,
                 number_of_data_items=len(dataset_variant),
@@ -425,7 +393,6 @@ def dataset_info_from_dataset(
                 duration=dataset_meta_info.get("duration", None),
                 duration_average=dataset_meta_info.get("duration_average", None),
                 frame_rate=dataset_meta_info.get("frame_rate", None),
-                # bit_rate: Optional[float] = None
                 n_cameras=dataset_meta_info.get("n_cameras", None),
             )
         )
@@ -448,165 +415,8 @@ def dataset_info_from_dataset(
         )
 
         object_reports: List[DbAnnotatedObjectReport] = []
-        primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
-        if has_primitive(dataset_split, PrimitiveType=Bbox):
-            df_per_instance = table_transformations.create_primitive_table(
-                dataset_split, PrimitiveType=Bbox, keep_sample_data=True
-            )
-            if df_per_instance is None:
-                raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
-            # Calculate area of bounding boxes
-            df_per_instance = df_per_instance.with_columns(
-                (pl.col("height") * pl.col("width")).alias("area"),
-            ).with_columns(
-                (pl.col("height") * pl.col("image.height")).alias("height_px"),
-                (pl.col("width") * pl.col("image.width")).alias("width_px"),
-                (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
-            )
-
-            annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-            for (class_name, task_name), class_group in df_per_instance.group_by(
-                PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-            ):
-                if class_name is None:
-                    continue
-                object_reports.append(
-                    DbAnnotatedObjectReport(
-                        obj=DbAnnotatedObject(
-                            name=class_name,
-                            entity_type=EntityTypeChoices.OBJECT.value,
-                            annotation_type=annotation_type,
-                            task_name=task_name,
-                        ),
-                        unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                        obj_instances=len(class_group),
-                        annotation_type=[annotation_type],
-                        images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        area_avg_ratio=class_group["area"].mean(),
-                        area_min_ratio=class_group["area"].min(),
-                        area_max_ratio=class_group["area"].max(),
-                        height_avg_ratio=class_group["height"].mean(),
-                        height_min_ratio=class_group["height"].min(),
-                        height_max_ratio=class_group["height"].max(),
-                        width_avg_ratio=class_group["width"].mean(),
-                        width_min_ratio=class_group["width"].min(),
-                        width_max_ratio=class_group["width"].max(),
-                        area_avg_px=class_group["area_px"].mean(),
-                        area_min_px=int(class_group["area_px"].min()),
-                        area_max_px=int(class_group["area_px"].max()),
-                        height_avg_px=class_group["height_px"].mean(),
-                        height_min_px=int(class_group["height_px"].min()),
-                        height_max_px=int(class_group["height_px"].max()),
-                        width_avg_px=class_group["width_px"].mean(),
-                        width_min_px=int(class_group["width_px"].min()),
-                        width_max_px=int(class_group["width_px"].max()),
-                        average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                    )
-                )
-
-        if has_primitive(dataset_split, PrimitiveType=Classification):
-            annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-            col_name = Classification.column_name()
-            classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
-            has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
-            if has_classification_data:
-                classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
-
-                # Include only classification tasks that are defined in the dataset info
-                classification_df = classification_df.filter(
-                    pl.col(PrimitiveField.TASK_NAME).is_in(classification_tasks)
-                )
-
-                for (
-                    task_name,
-                    class_name,
-                ), class_group in classification_df.group_by(PrimitiveField.TASK_NAME, PrimitiveField.CLASS_NAME):
-                    if class_name is None:
-                        continue
-                    if task_name == Classification.default_task_name():
-                        display_name = class_name  # Prefix class name with task name
-                    else:
-                        display_name = f"{task_name}.{class_name}"
-                    object_reports.append(
-                        DbAnnotatedObjectReport(
-                            obj=DbAnnotatedObject(
-                                name=display_name,
-                                entity_type=EntityTypeChoices.EVENT.value,
-                                annotation_type=annotation_type,
-                                task_name=task_name,
-                            ),
-                            unique_obj_ids=len(
-                                class_group
-                            ),  # Unique object IDs are not applicable for classification
-                            obj_instances=len(class_group),
-                            annotation_type=[annotation_type],
-                        )
-                    )
-
-        if has_primitive(dataset_split, PrimitiveType=Segmentation):
-            raise NotImplementedError("Not Implemented yet")
-
-        if has_primitive(dataset_split, PrimitiveType=Bitmask):
-            col_name = Bitmask.column_name()
-            drop_columns = [col for col in primitive_columns if col != col_name]
-            drop_columns.append(PrimitiveField.META)
-
-            df_per_instance = table_transformations.create_primitive_table(
-                dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
-            )
-            if df_per_instance is None:
-                raise ValueError(
-                    f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
-                )
-            df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
-            df_per_instance = df_per_instance.with_columns(
-                (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
-                (pl.col("height_px") / pl.col("image.height")).alias("height"),
-                (pl.col("width_px") / pl.col("image.width")).alias("width"),
-            )
-
-            annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-            for (class_name, task_name), class_group in df_per_instance.group_by(
-                PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-            ):
-                if class_name is None:
-                    continue
-                object_reports.append(
-                    DbAnnotatedObjectReport(
-                        obj=DbAnnotatedObject(
-                            name=class_name,
-                            entity_type=EntityTypeChoices.OBJECT.value,
-                            annotation_type=annotation_type,
-                            task_name=task_name,
-                        ),
-                        unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                        obj_instances=len(class_group),
-                        annotation_type=[annotation_type],
-                        average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        area_avg_ratio=class_group["area"].mean(),
-                        area_min_ratio=class_group["area"].min(),
-                        area_max_ratio=class_group["area"].max(),
-                        height_avg_ratio=class_group["height"].mean(),
-                        height_min_ratio=class_group["height"].min(),
-                        height_max_ratio=class_group["height"].max(),
-                        width_avg_ratio=class_group["width"].mean(),
-                        width_min_ratio=class_group["width"].min(),
-                        width_max_ratio=class_group["width"].max(),
-                        area_avg_px=class_group["area_px"].mean(),
-                        area_min_px=int(class_group["area_px"].min()),
-                        area_max_px=int(class_group["area_px"].max()),
-                        height_avg_px=class_group["height_px"].mean(),
-                        height_min_px=int(class_group["height_px"].min()),
-                        height_max_px=int(class_group["height_px"].max()),
-                        width_avg_px=class_group["width_px"].mean(),
-                        width_min_px=int(class_group["width_px"].min()),
-                        width_max_px=int(class_group["width_px"].max()),
-                    )
-                )
-
-        if has_primitive(dataset_split, PrimitiveType=Polygon):
-            raise NotImplementedError("Not Implemented yet")
+        for PrimitiveType in [Classification, Bbox, Bitmask, Polygon, Segmentation]:
+            object_reports.extend(create_reports_from_primitive(dataset_split, PrimitiveType=PrimitiveType))  # type: ignore[type-abstract]
 
         # Sort object reports by name to more easily compare between versions
         object_reports = sorted(object_reports, key=lambda x: x.obj.name)  # Sort object reports by name
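Note: the removed per-primitive blocks are consolidated into create_reports_from_primitive (added further down in this file); a short sketch of calling it for one primitive type, assuming `dataset_split` is a polars DataFrame for a single split:

    from hafnia.dataset.dataset_details_uploader import create_reports_from_primitive
    from hafnia.dataset.primitives import Bbox

    # `dataset_split` is assumed to be a polars DataFrame holding one dataset split
    bbox_reports = create_reports_from_primitive(dataset_split, PrimitiveType=Bbox)
    # Returns an empty list when the split contains no Bbox annotations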
@@ -617,14 +427,14 @@ def dataset_info_from_dataset(
 
         dataset_reports.append(report)
     dataset_name = dataset.info.dataset_name
-    bucket_sample = generate_bucket_name(dataset_name, deployment_stage=deployment_stage)
-    dataset_info = DbDataset(
+    dataset_info = DatasetDetails(
         name=dataset_name,
+        title=dataset.info.dataset_title,
+        overview=dataset.info.description,
         version=dataset.info.version,
-        s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-        latest_update=dataset.info.updated_at,
+        dataset_updated_at=dataset.info.updated_at,
         dataset_format_version=dataset.info.format_version,
         license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
@@ -639,6 +449,101 @@ def dataset_info_from_dataset(
     return dataset_info
 
 
+def create_reports_from_primitive(
+    dataset_split: pl.DataFrame, PrimitiveType: Type[Primitive]
+) -> List[DbAnnotatedObjectReport]:
+    if not table_transformations.has_primitive(dataset_split, PrimitiveType=PrimitiveType):
+        return []
+
+    if PrimitiveType == Segmentation:
+        raise NotImplementedError("Not Implemented yet")
+
+    df_per_instance = table_transformations.create_primitive_table(
+        dataset_split, PrimitiveType=PrimitiveType, keep_sample_data=True
+    )
+    if df_per_instance is None:
+        raise ValueError(f"Expected {PrimitiveType.__name__} primitive column to be present in the dataset split.")
+
+    entity_type = EntityTypeChoices.OBJECT.value
+    if PrimitiveType == Classification:
+        entity_type = EntityTypeChoices.EVENT.value
+
+    if PrimitiveType == Bbox:
+        df_per_instance = df_per_instance.with_columns(area=pl.col("height") * pl.col("width"))
+
+    if PrimitiveType == Bitmask:
+        # width and height are in pixel format for Bitmask convert to ratio
+        df_per_instance = df_per_instance.with_columns(
+            width=pl.col("width") / pl.col("image.width"),
+            height=pl.col("height") / pl.col("image.height"),
+        )
+
+    has_height_field = "height" in df_per_instance.columns and df_per_instance["height"].dtype != pl.Null
+    if has_height_field:
+        df_per_instance = df_per_instance.with_columns(
+            height_px=pl.col("height") * pl.col("image.height"),
+        )
+
+    has_width_field = "width" in df_per_instance.columns and df_per_instance["width"].dtype != pl.Null
+    if has_width_field:
+        df_per_instance = df_per_instance.with_columns(
+            width_px=pl.col("width") * pl.col("image.width"),
+        )
+
+    has_area_field = "area" in df_per_instance.columns and df_per_instance["area"].dtype != pl.Null
+    if has_area_field:
+        df_per_instance = df_per_instance.with_columns(
+            area_px=pl.col("image.height") * pl.col("image.width") * pl.col("area")
+        )
+    object_reports: List[DbAnnotatedObjectReport] = []
+    annotation_type = DbAnnotationType(name=PrimitiveType.__name__)
+    for (class_name, task_name), class_group in df_per_instance.group_by(
+        PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
+    ):
+        if class_name is None:
+            continue
+
+        object_report = DbAnnotatedObjectReport(
+            obj=DbAnnotatedObject(
+                name=class_name,
+                entity_type=entity_type,
+                annotation_type=annotation_type,
+                task_name=task_name,
+            ),
+            unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
+            obj_instances=len(class_group),
+            annotation_type=[annotation_type],
+            average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
+            images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
+        )
+        if has_height_field:
+            object_report.height_avg_ratio = class_group["height"].mean()
+            object_report.height_min_ratio = class_group["height"].min()
+            object_report.height_max_ratio = class_group["height"].max()
+            object_report.height_avg_px = class_group["height_px"].mean()
+            object_report.height_min_px = int(class_group["height_px"].min())
+            object_report.height_max_px = int(class_group["height_px"].max())
+
+        if has_width_field:
+            object_report.width_avg_ratio = class_group["width"].mean()
+            object_report.width_min_ratio = class_group["width"].min()
+            object_report.width_max_ratio = class_group["width"].max()
+            object_report.width_avg_px = class_group["width_px"].mean()
+            object_report.width_min_px = int(class_group["width_px"].min())
+            object_report.width_max_px = int(class_group["width_px"].max())
+
+        if has_area_field:
+            object_report.area_avg_ratio = class_group["area"].mean()
+            object_report.area_min_ratio = class_group["area"].min()
+            object_report.area_max_ratio = class_group["area"].max()
+            object_report.area_avg_px = class_group["area_px"].mean()
+            object_report.area_min_px = int(class_group["area_px"].min())
+            object_report.area_max_px = int(class_group["area_px"].max())
+
+        object_reports.append(object_report)
+    return object_reports
+
+
 def create_gallery_images(
     dataset: HafniaDataset,
     path_gallery_images: Optional[Path],
@@ -647,7 +552,7 @@ def create_gallery_images(
     gallery_images = None
     if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
         if path_gallery_images is None:
-            raise ValueError("Path to gallery images must be provided.")
+            path_gallery_images = get_path_dataset_gallery_images(dataset.info.dataset_name)
         path_gallery_images.mkdir(parents=True, exist_ok=True)
         COL_IMAGE_NAME = "image_name"
         samples = dataset.samples.with_columns(
@@ -657,7 +562,12 @@ def create_gallery_images(
 
         missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
         if len(missing_gallery_samples):
-            raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
+            potential_samples = samples[COL_IMAGE_NAME].sort().to_list()
+            formatted_samples = ", ".join([f'"{s}"' for s in potential_samples[:9]])
+            raise ValueError(
+                f"Gallery images not found in dataset: {missing_gallery_samples}. "
+                f"Consider adding this to dataset definition: \ngallery_image_names=[{formatted_samples}]"
+            )
         gallery_images = []
         for gallery_sample in gallery_samples.iter_rows(named=True):
             sample = Sample(**gallery_sample)
@@ -57,20 +57,6 @@ def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_s
 def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
     """
     Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
-
-    E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
-    'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
-    Notice that the hash is used for both the filename and the subfolder name.
-
-    Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
-    unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
-
-    The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
-    download 3500 files per second from a single folder (prefix) in S3.
-
-    For even a single user, we found that this limit was being reached when files are stored in single folder (prefix)
-    in S3. To support multiple users and concurrent experiments, we are required to separate files into
-    multiple sub-folders (prefixes) in S3 to not hit the rate limit.
     """
 
     if not path_source.exists():
@@ -86,7 +72,7 @@ def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
 
 
 def relative_path_from_hash(hash: str, suffix: str) -> Path:
-    path_file = Path("data") / hash[:3] / f"{hash}{suffix}"
+    path_file = Path("data") / f"{hash}{suffix}"
     return path_file
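Note: files are no longer sharded into hash-prefixed sub-folders, as the removed docstring above described. A quick before/after for the example hash from that docstring, assuming these helpers live in hafnia/dataset/dataset_helpers.py (the file listed with a matching +1/-15 change):

    from hafnia.dataset.dataset_helpers import relative_path_from_hash

    relative_path_from_hash("dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", suffix=".png")
    # 0.4.2: data/dfe/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png
    # 0.5.0: data/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png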
 
 
@@ -2,6 +2,7 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 import boto3
+from botocore.exceptions import UnauthorizedSSOTokenError
 from pydantic import BaseModel, field_validator
 
 FILENAME_RECIPE_JSON = "recipe.json"
@@ -21,6 +22,7 @@ class DeploymentStage(Enum):
     PRODUCTION = "production"
 
 
+ARN_PREFIX = "arn:aws:s3:::"
 TAG_IS_SAMPLE = "sample"
 
 OPS_REMOVE_CLASS = "__REMOVE__"
@@ -93,6 +95,32 @@ class SplitName:
     def all_split_names() -> List[str]:
         return [*SplitName.valid_splits(), SplitName.UNDEFINED]
 
+    @staticmethod
+    def map_split_name(potential_split_name: str, strict: bool = True) -> str:
+        normalized = potential_split_name.strip().lower()
+
+        if normalized in SPLIT_NAME_MAPPINGS:
+            return SPLIT_NAME_MAPPINGS[normalized]
+
+        if strict:
+            raise ValueError(f"Unrecognized split name: {potential_split_name}")
+        else:
+            return SplitName.UNDEFINED
+
+
+SPLIT_NAME_MAPPINGS = {
+    # Train variations
+    "train": SplitName.TRAIN,
+    "training": SplitName.TRAIN,
+    # Validation variations
+    "validation": SplitName.VAL,
+    "val": SplitName.VAL,
+    "valid": SplitName.VAL,
+    # Test variations
+    "test": SplitName.TEST,
+    "testing": SplitName.TEST,
+}
+
 
 class DatasetVariant(Enum):
     DUMP = "dump"
@@ -125,7 +153,14 @@ class AwsCredentials(BaseModel):
         """
         Creates AwsCredentials from a Boto3 session.
         """
-        frozen_credentials = session.get_credentials().get_frozen_credentials()
+        try:
+            frozen_credentials = session.get_credentials().get_frozen_credentials()
+        except UnauthorizedSSOTokenError as e:
+            raise RuntimeError(
+                f"Failed to get AWS credentials from the session for profile '{session.profile_name}'.\n"
+                f"Ensure the profile exists in your AWS config in '~/.aws/config' and that you are logged in via AWS SSO.\n"
+                f"\tUse 'aws sso login --profile {session.profile_name}' to log in."
+            ) from e
         return AwsCredentials(
             access_key=frozen_credentials.access_key,
             secret_key=frozen_credentials.secret_key,
@@ -133,8 +168,13 @@ class AwsCredentials(BaseModel):
             region=session.region_name,
         )
 
-
-ARN_PREFIX = "arn:aws:s3:::"
+    def to_resource_credentials(self, bucket_name: str) -> "ResourceCredentials":
+        """
+        Converts AwsCredentials to ResourceCredentials by adding the S3 ARN.
+        """
+        payload = self.model_dump()
+        payload["s3_arn"] = f"{ARN_PREFIX}{bucket_name}"
+        return ResourceCredentials(**payload)
 
 
 class ResourceCredentials(AwsCredentials):
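Note: a minimal sketch of the new to_resource_credentials helper, assuming `creds` is an AwsCredentials instance created elsewhere (e.g. from a boto3 session); the bucket name is a placeholder:

    # `creds` is assumed to be an AwsCredentials instance
    resource_creds = creds.to_resource_credentials(bucket_name="my-dataset-bucket")
    print(resource_creds.s3_arn)  # arn:aws:s3:::my-dataset-bucket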