PyPI - hafnia - Versions diffs - 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

cli/__main__.py +16 -3
cli/config.py +45 -4
cli/consts.py +1 -1
cli/dataset_cmds.py +6 -14
cli/dataset_recipe_cmds.py +78 -0
cli/experiment_cmds.py +226 -43
cli/keychain.py +88 -0
cli/profile_cmds.py +10 -6
cli/runc_cmds.py +5 -5
cli/trainer_package_cmds.py +65 -0
hafnia/__init__.py +2 -0
hafnia/data/factory.py +1 -2
hafnia/dataset/dataset_helpers.py +9 -14
hafnia/dataset/dataset_names.py +10 -5
hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
hafnia/dataset/dataset_upload_helper.py +265 -56
hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
hafnia/dataset/hafnia_dataset.py +577 -213
hafnia/dataset/license_types.py +63 -0
hafnia/dataset/operations/dataset_stats.py +259 -3
hafnia/dataset/operations/dataset_transformations.py +332 -7
hafnia/dataset/operations/table_transformations.py +43 -5
hafnia/dataset/primitives/__init__.py +8 -0
hafnia/dataset/primitives/bbox.py +25 -12
hafnia/dataset/primitives/bitmask.py +26 -14
hafnia/dataset/primitives/classification.py +16 -8
hafnia/dataset/primitives/point.py +7 -3
hafnia/dataset/primitives/polygon.py +16 -9
hafnia/dataset/primitives/segmentation.py +10 -7
hafnia/experiment/hafnia_logger.py +111 -8
hafnia/http.py +16 -2
hafnia/platform/__init__.py +9 -3
hafnia/platform/builder.py +12 -10
hafnia/platform/dataset_recipe.py +104 -0
hafnia/platform/datasets.py +47 -9
hafnia/platform/download.py +25 -19
hafnia/platform/experiment.py +51 -56
hafnia/platform/trainer_package.py +57 -0
hafnia/utils.py +81 -13
hafnia/visualizations/image_visualizations.py +4 -4
{hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
hafnia-0.4.0.dist-info/RECORD +56 -0
cli/recipe_cmds.py +0 -45
hafnia-0.2.4.dist-info/RECORD +0 -49
{hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
{hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
{hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0

hafnia/dataset/dataset_upload_helper.py CHANGED Viewed

@@ -1,19 +1,16 @@
 from __future__ import annotations
+import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 import boto3
 import polars as pl
-from pydantic import BaseModel, ConfigDict
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, field_validator
-import hafnia.dataset.primitives.bbox
-import hafnia.dataset.primitives.bitmask
-import hafnia.dataset.primitives.classification
-import hafnia.dataset.primitives.polygon
-import hafnia.dataset.primitives.segmentation
 from cli.config import Config
 from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
@@ -23,11 +20,19 @@ from hafnia.dataset.dataset_names import (
     FieldName,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
+from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.operations import table_transformations
+from hafnia.dataset.primitives import (
+    Bbox,
+    Bitmask,
+    Classification,
+    Polygon,
+    Segmentation,
+)
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.http import post
 from hafnia.log import user_logger
-from hafnia.platform import get_dataset_id
+from hafnia.platform.datasets import get_dataset_id
 def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
@@ -47,13 +52,14 @@ class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
+    dataset_format_version: Optional[str] = None
     annotation_date: Optional[datetime] = None
     annotation_project_id: Optional[str] = None
     annotation_dataset_id: Optional[str] = None
     annotation_ontology: Optional[str] = None
     dataset_variants: Optional[List[DbDatasetVariant]] = None
     split_annotations_reports: Optional[List[DbSplitAnnotationsReport]] = None
-    dataset_images: Optional[List[DatasetImage]] = None
+    imgs: Optional[List[DatasetImage]] = None
 class DbDatasetVariant(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -75,6 +81,8 @@ class DbAnnotatedObject(BaseModel, validate_assignment=True):  # type: ignore[ca
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
     entity_type: EntityTypeChoices
+    annotation_type: DbAnnotationType
+    task_name: Optional[str] = None  # Not sure if adding task_name makes sense.
 class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -82,10 +90,34 @@ class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ign
     obj: DbAnnotatedObject
     unique_obj_ids: Optional[int] = None
     obj_instances: Optional[int] = None
+    images_with_obj: Optional[int] = None
     average_count_per_image: Optional[float] = None
-    avg_area: Optional[float] = None
-    min_area: Optional[float] = None
-    max_area: Optional[float] = None
+    area_avg_ratio: Optional[float] = None
+    area_min_ratio: Optional[float] = None
+    area_max_ratio: Optional[float] = None
+    height_avg_ratio: Optional[float] = None
+    height_min_ratio: Optional[float] = None
+    height_max_ratio: Optional[float] = None
+    width_avg_ratio: Optional[float] = None
+    width_min_ratio: Optional[float] = None
+    width_max_ratio: Optional[float] = None
+    area_avg_px: Optional[float] = None
+    area_min_px: Optional[int] = None
+    area_max_px: Optional[int] = None
+    height_avg_px: Optional[float] = None
+    height_min_px: Optional[int] = None
+    height_max_px: Optional[int] = None
+    width_avg_px: Optional[float] = None
+    width_min_px: Optional[int] = None
+    width_max_px: Optional[int] = None
     annotation_type: Optional[List[DbAnnotationType]] = None
@@ -155,8 +187,78 @@ class EntityTypeChoices(str, Enum):  # Should match `EntityTypeChoices` in `dipd
     EVENT = "EVENT"
-class DatasetImage(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
-    img: str
+class Annotations(BaseModel):
+    """
+    Used in 'DatasetImageMetadata' for visualizing image annotations
+    in gallery images on the dataset detail page.
+    """
+    objects: Optional[List[Bbox]] = None
+    classifications: Optional[List[Classification]] = None
+    polygons: Optional[List[Polygon]] = None
+    bitmasks: Optional[List[Bitmask]] = None
+class DatasetImageMetadata(BaseModel):
+    """
+    Metadata for gallery images on the dataset detail page on portal.
+    """
+    annotations: Optional[Annotations] = None
+    meta: Optional[Dict[str, Any]] = None
+    @classmethod
+    def from_sample(cls, sample: Sample) -> "DatasetImageMetadata":
+        sample = sample.model_copy(deep=True)
+        sample.file_path = "/".join(Path(sample.file_path).parts[-3:])
+        metadata = {}
+        metadata_field_names = [
+            ColumnName.FILE_PATH,
+            ColumnName.HEIGHT,
+            ColumnName.WIDTH,
+            ColumnName.SPLIT,
+        ]
+        for field_name in metadata_field_names:
+            if hasattr(sample, field_name) and getattr(sample, field_name) is not None:
+                metadata[field_name] = getattr(sample, field_name)
+        obj = DatasetImageMetadata(
+            annotations=Annotations(
+                objects=sample.objects,
+                classifications=sample.classifications,
+                polygons=sample.polygons,
+                bitmasks=sample.bitmasks,
+            ),
+            meta=metadata,
+        )
+        return obj
+class DatasetImage(Attribution, validate_assignment=True):  # type: ignore[call-arg]
+    img: str  # Base64-encoded image string
+    order: Optional[int] = None
+    metadata: Optional[DatasetImageMetadata] = None
+    @field_validator("img", mode="before")
+    def validate_image_path(cls, v: Union[str, Path]) -> str:
+        if isinstance(v, Path):
+            v = path_image_to_base64_str(path_image=v)
+        if not isinstance(v, str):
+            raise ValueError("Image must be a string or Path object representing the image path.")
+        if not v.startswith("data:image/"):
+            raise ValueError("Image must be a base64-encoded data URL.")
+        return v
+def path_image_to_base64_str(path_image: Path) -> str:
+    image = Image.open(path_image)
+    mime_format = Image.MIME[image.format]
+    as_b64 = base64.b64encode(path_image.read_bytes()).decode("ascii")
+    return f"data:{mime_format};base64,{as_b64}"
 class DbDistributionType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -185,7 +287,10 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
-def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset) -> dict:
+def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
+    if not upload_gallery_images:
+        dataset_update.imgs = None
     cfg = Config()
     dataset_details = dataset_update.model_dump_json()
     data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
@@ -199,9 +304,9 @@ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
     import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
     headers = {"Authorization": cfg.api_key}
-    user_logger.info("Importing dataset details. This may take up to 30 seconds...")
-    data = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
-    return data  # type: ignore[return-value]
+    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
+    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+    return response  # type: ignore[return-value]
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
@@ -219,7 +324,6 @@ def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Ty
     col_name = PrimitiveType.column_name()
     table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
     if col_name not in table.columns:
-        user_logger.warning(f"Warning: No field called '{col_name}' was found for '{PrimitiveType.__name__}'.")
         return False
     if table[col_name].dtype == pl.Null:
@@ -235,7 +339,7 @@ def calculate_distribution_values(
     if len(distribution_tasks) == 0:
         return []
-    classification_column = hafnia.dataset.primitives.classification.Classification.column_name()
+    classification_column = Classification.column_name()
     classifications = dataset_split.select(pl.col(classification_column).explode())
     classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
     classifications = classifications.filter(
@@ -277,6 +381,8 @@ def dataset_info_from_dataset(
     deployment_stage: DeploymentStage,
     path_sample: Optional[Path],
     path_hidden: Optional[Path],
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
 ) -> DbDataset:
     dataset_variants = []
     dataset_reports = []
@@ -292,6 +398,12 @@ def dataset_info_from_dataset(
     if len(path_and_variant) == 0:
         raise ValueError("At least one path must be provided for sample or hidden dataset.")
+    gallery_images = create_gallery_images(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+    )
     for path_dataset, variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
@@ -331,19 +443,26 @@ def dataset_info_from_dataset(
             )
             object_reports: List[DbAnnotatedObjectReport] = []
-            primitive_columns = [tPrimtive.column_name() for tPrimtive in primitives.PRIMITIVE_TYPES]
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bbox.Bbox):
-                bbox_column_name = hafnia.dataset.primitives.bbox.Bbox.column_name()
-                drop_columns = [col for col in primitive_columns if col != bbox_column_name]
-                drop_columns.append(FieldName.META)
-                df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
-                df_per_instance = df_per_instance.explode(bbox_column_name).drop(drop_columns).unnest(bbox_column_name)
+            primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
+            if has_primitive(dataset_split, PrimitiveType=Bbox):
+                df_per_instance = table_transformations.create_primitive_table(
+                    dataset_split, PrimitiveType=Bbox, keep_sample_data=True
+                )
+                if df_per_instance is None:
+                    raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
                 # Calculate area of bounding boxes
-                df_per_instance = df_per_instance.with_columns((pl.col("height") * pl.col("width")).alias("area"))
+                df_per_instance = df_per_instance.with_columns(
+                    (pl.col("height") * pl.col("width")).alias("area"),
+                ).with_columns(
+                    (pl.col("height") * pl.col("image.height")).alias("height_px"),
+                    (pl.col("width") * pl.col("image.width")).alias("width_px"),
+                    (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
+                )
                 annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-                for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                for (class_name, task_name), class_group in df_per_instance.group_by(
+                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                ):
                     if class_name is None:
                         continue
                     object_reports.append(
@@ -351,25 +470,39 @@ def dataset_info_from_dataset(
                             obj=DbAnnotatedObject(
                                 name=class_name,
                                 entity_type=EntityTypeChoices.OBJECT.value,
+                                annotation_type=annotation_type,
+                                task_name=task_name,
                             ),
                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
-                            avg_area=class_group["area"].mean(),
-                            min_area=class_group["area"].min(),
-                            max_area=class_group["area"].max(),
+                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            area_avg_ratio=class_group["area"].mean(),
+                            area_min_ratio=class_group["area"].min(),
+                            area_max_ratio=class_group["area"].max(),
+                            height_avg_ratio=class_group["height"].mean(),
+                            height_min_ratio=class_group["height"].min(),
+                            height_max_ratio=class_group["height"].max(),
+                            width_avg_ratio=class_group["width"].mean(),
+                            width_min_ratio=class_group["width"].min(),
+                            width_max_ratio=class_group["width"].max(),
+                            area_avg_px=class_group["area_px"].mean(),
+                            area_min_px=int(class_group["area_px"].min()),
+                            area_max_px=int(class_group["area_px"].max()),
+                            height_avg_px=class_group["height_px"].mean(),
+                            height_min_px=int(class_group["height_px"].min()),
+                            height_max_px=int(class_group["height_px"].max()),
+                            width_avg_px=class_group["width_px"].mean(),
+                            width_min_px=int(class_group["width_px"].min()),
+                            width_max_px=int(class_group["width_px"].max()),
                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
                         )
                     )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.classification.Classification):
+            if has_primitive(dataset_split, PrimitiveType=Classification):
                 annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-                col_name = hafnia.dataset.primitives.classification.Classification.column_name()
-                classification_tasks = [
-                    task.name
-                    for task in dataset.info.tasks
-                    if task.primitive == hafnia.dataset.primitives.classification.Classification
-                ]
+                col_name = Classification.column_name()
+                classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
                 has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
                 if has_classification_data:
                     classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
@@ -385,7 +518,7 @@ def dataset_info_from_dataset(
                     ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
                         if class_name is None:
                             continue
-                        if task_name == hafnia.dataset.primitives.classification.Classification.default_task_name():
+                        if task_name == Classification.default_task_name():
                             display_name = class_name  # Prefix class name with task name
                         else:
                             display_name = f"{task_name}.{class_name}"
@@ -394,6 +527,8 @@ def dataset_info_from_dataset(
                                 obj=DbAnnotatedObject(
                                     name=display_name,
                                     entity_type=EntityTypeChoices.EVENT.value,
+                                    annotation_type=annotation_type,
+                                    task_name=task_name,
                                 ),
                                 unique_obj_ids=len(
                                     class_group
@@ -403,22 +538,32 @@ def dataset_info_from_dataset(
                             )
                         )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.segmentation.Segmentation):
+            if has_primitive(dataset_split, PrimitiveType=Segmentation):
                 raise NotImplementedError("Not Implemented yet")
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bitmask.Bitmask):
-                col_name = hafnia.dataset.primitives.bitmask.Bitmask.column_name()
+            if has_primitive(dataset_split, PrimitiveType=Bitmask):
+                col_name = Bitmask.column_name()
                 drop_columns = [col for col in primitive_columns if col != col_name]
                 drop_columns.append(FieldName.META)
-                df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
-                df_per_instance = df_per_instance.explode(col_name).drop(drop_columns).unnest(col_name)
-                min_area = df_per_instance["area"].min() if "area" in df_per_instance.columns else None
-                max_area = df_per_instance["area"].max() if "area" in df_per_instance.columns else None
-                avg_area = df_per_instance["area"].mean() if "area" in df_per_instance.columns else None
+                df_per_instance = table_transformations.create_primitive_table(
+                    dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
+                )
+                if df_per_instance is None:
+                    raise ValueError(
+                        f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
+                    )
+                df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
+                df_per_instance = df_per_instance.with_columns(
+                    (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
+                    (pl.col("height_px") / pl.col("image.height")).alias("height"),
+                    (pl.col("width_px") / pl.col("image.width")).alias("width"),
+                )
                 annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-                for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                for (class_name, task_name), class_group in df_per_instance.group_by(
+                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                ):
                     if class_name is None:
                         continue
                     object_reports.append(
@@ -426,18 +571,36 @@ def dataset_info_from_dataset(
                             obj=DbAnnotatedObject(
                                 name=class_name,
                                 entity_type=EntityTypeChoices.OBJECT.value,
+                                annotation_type=annotation_type,
+                                task_name=task_name,
                             ),
                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
-                            avg_area=avg_area,
-                            min_area=min_area,
-                            max_area=max_area,
+                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            area_avg_ratio=class_group["area"].mean(),
+                            area_min_ratio=class_group["area"].min(),
+                            area_max_ratio=class_group["area"].max(),
+                            height_avg_ratio=class_group["height"].mean(),
+                            height_min_ratio=class_group["height"].min(),
+                            height_max_ratio=class_group["height"].max(),
+                            width_avg_ratio=class_group["width"].mean(),
+                            width_min_ratio=class_group["width"].min(),
+                            width_max_ratio=class_group["width"].max(),
+                            area_avg_px=class_group["area_px"].mean(),
+                            area_min_px=int(class_group["area_px"].min()),
+                            area_max_px=int(class_group["area_px"].max()),
+                            height_avg_px=class_group["height_px"].mean(),
+                            height_min_px=int(class_group["height_px"].min()),
+                            height_max_px=int(class_group["height_px"].max()),
+                            width_avg_px=class_group["width_px"].mean(),
+                            width_min_px=int(class_group["width_px"].min()),
+                            width_max_px=int(class_group["width_px"].max()),
                         )
                     )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.polygon.Polygon):
+            if has_primitive(dataset_split, PrimitiveType=Polygon):
                 raise NotImplementedError("Not Implemented yet")
             # Sort object reports by name to more easily compare between versions
@@ -456,13 +619,59 @@ def dataset_info_from_dataset(
         s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-        license_citation=dataset_meta_info.get("license_citation", None),
+        latest_update=dataset.info.updated_at,
+        dataset_format_version=dataset.info.format_version,
+        license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
         data_captured_end=dataset_meta_info.get("data_captured_end", None),
         data_received_start=dataset_meta_info.get("data_received_start", None),
         data_received_end=dataset_meta_info.get("data_received_end", None),
         annotation_project_id=dataset_meta_info.get("annotation_project_id", None),
         annotation_dataset_id=dataset_meta_info.get("annotation_dataset_id", None),
+        imgs=gallery_images,
     )
     return dataset_info
+def create_gallery_images(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path],
+    gallery_image_names: Optional[List[str]],
+) -> Optional[List[DatasetImage]]:
+    gallery_images = None
+    if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
+        if path_gallery_images is None:
+            raise ValueError("Path to gallery images must be provided.")
+        path_gallery_images.mkdir(parents=True, exist_ok=True)
+        COL_IMAGE_NAME = "image_name"
+        samples = dataset.samples.with_columns(
+            dataset.samples[ColumnName.FILE_PATH].str.split("/").list.last().alias(COL_IMAGE_NAME)
+        )
+        gallery_samples = samples.filter(pl.col(COL_IMAGE_NAME).is_in(gallery_image_names))
+        missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
+        if len(missing_gallery_samples):
+            raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
+        gallery_images = []
+        for gallery_sample in gallery_samples.iter_rows(named=True):
+            sample = Sample(**gallery_sample)
+            metadata = DatasetImageMetadata.from_sample(sample=sample)
+            sample.classifications = None  # To not draw classifications in gallery images
+            image = sample.draw_annotations()
+            path_gallery_image = path_gallery_images / gallery_sample[COL_IMAGE_NAME]
+            Image.fromarray(image).save(path_gallery_image)
+            dataset_image_dict = {
+                "img": path_gallery_image,
+                "metadata": metadata,
+            }
+            if sample.attribution is not None:
+                sample.attribution.changes = "Annotations have been visualized"
+                dataset_image_dict.update(sample.attribution.model_dump(exclude_none=True))
+            gallery_img = DatasetImage(**dataset_image_dict)
+            gallery_img.licenses = gallery_img.licenses or []
+            gallery_images.append(gallery_img)
+    return gallery_images

hafnia/dataset/format_conversions/image_classification_from_directory.py ADDED Viewed

@@ -0,0 +1,106 @@
+import shutil
+from pathlib import Path
+from typing import List, Optional
+import more_itertools
+import polars as pl
+from PIL import Image
+from rich.progress import track
+from hafnia.dataset.dataset_names import ColumnName, FieldName
+from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.primitives import Classification
+from hafnia.utils import is_image_file
+def import_image_classification_directory_tree(
+    path_folder: Path,
+    split: str,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    class_folder_paths = [path for path in path_folder.iterdir() if path.is_dir()]
+    class_names = sorted([folder.name for folder in class_folder_paths])  # Sort for determinism
+    # Gather all image paths per class
+    path_images_per_class: List[List[Path]] = []
+    for path_class_folder in class_folder_paths:
+        per_class_images = []
+        for path_image in list(path_class_folder.rglob("*.*")):
+            if is_image_file(path_image):
+                per_class_images.append(path_image)
+        path_images_per_class.append(sorted(per_class_images))
+    # Interleave to ensure classes are balanced in the output dataset for n_samples < total
+    path_images = list(more_itertools.interleave_longest(*path_images_per_class))
+    if n_samples is not None:
+        path_images = path_images[:n_samples]
+    samples = []
+    for path_image_org in track(path_images, description="Convert 'image classification' dataset to Hafnia Dataset"):
+        class_name = path_image_org.parent.name
+        read_image = Image.open(path_image_org)
+        width, height = read_image.size
+        classifications = [Classification(class_name=class_name, class_idx=class_names.index(class_name))]
+        sample = Sample(
+            file_path=str(path_image_org.absolute()),
+            width=width,
+            height=height,
+            split=split,
+            classifications=classifications,
+        )
+        samples.append(sample)
+    dataset_info = DatasetInfo(
+        dataset_name="ImageClassificationFromDirectoryTree",
+        tasks=[TaskInfo(primitive=Classification, class_names=class_names)],
+    )
+    hafnia_dataset = HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+    return hafnia_dataset
+def export_image_classification_directory_tree(
+    dataset: HafniaDataset,
+    path_output: Path,
+    task_name: Optional[str] = None,
+    clean_folder: bool = False,
+) -> Path:
+    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
+    samples = dataset.samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.filter(pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        .alias(task.primitive.column_name())
+    )
+    classification_counts = samples[task.primitive.column_name()].list.len()
+    has_no_classification_samples = (classification_counts == 0).sum()
+    if has_no_classification_samples > 0:
+        raise ValueError(f"Some samples do not have a classification for task '{task.name}'.")
+    has_multi_classification_samples = (classification_counts > 1).sum()
+    if has_multi_classification_samples > 0:
+        raise ValueError(f"Some samples have multiple classifications for task '{task.name}'.")
+    if clean_folder:
+        shutil.rmtree(path_output, ignore_errors=True)
+    path_output.mkdir(parents=True, exist_ok=True)
+    description = "Export Hafnia Dataset to directory tree"
+    for sample_dict in track(samples.iter_rows(named=True), total=len(samples), description=description):
+        classifications = sample_dict[task.primitive.column_name()]
+        if len(classifications) != 1:
+            raise ValueError("Each sample should have exactly one classification.")
+        classification = classifications[0]
+        class_name = classification[FieldName.CLASS_NAME].replace("/", "_")  # Avoid issues with subfolders
+        path_class_folder = path_output / class_name
+        path_class_folder.mkdir(parents=True, exist_ok=True)
+        path_image_org = Path(sample_dict[ColumnName.FILE_PATH])
+        path_image_new = path_class_folder / path_image_org.name
+        shutil.copy2(path_image_org, path_image_new)
+    return path_output

hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl