PyPI - hafnia - Versions diffs - 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

hafnia 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

cli/__main__.py +13 -2
cli/config.py +2 -1
cli/consts.py +1 -1
cli/dataset_cmds.py +6 -14
cli/dataset_recipe_cmds.py +78 -0
cli/experiment_cmds.py +226 -43
cli/profile_cmds.py +6 -5
cli/runc_cmds.py +5 -5
cli/trainer_package_cmds.py +65 -0
hafnia/__init__.py +2 -0
hafnia/data/factory.py +1 -2
hafnia/dataset/dataset_helpers.py +0 -12
hafnia/dataset/dataset_names.py +8 -4
hafnia/dataset/dataset_recipe/dataset_recipe.py +119 -33
hafnia/dataset/dataset_recipe/recipe_transforms.py +32 -4
hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
hafnia/dataset/dataset_upload_helper.py +206 -53
hafnia/dataset/hafnia_dataset.py +432 -194
hafnia/dataset/license_types.py +63 -0
hafnia/dataset/operations/dataset_stats.py +260 -3
hafnia/dataset/operations/dataset_transformations.py +325 -4
hafnia/dataset/operations/table_transformations.py +39 -2
hafnia/dataset/primitives/__init__.py +8 -0
hafnia/dataset/primitives/classification.py +1 -1
hafnia/experiment/hafnia_logger.py +112 -0
hafnia/http.py +16 -2
hafnia/platform/__init__.py +9 -3
hafnia/platform/builder.py +12 -10
hafnia/platform/dataset_recipe.py +99 -0
hafnia/platform/datasets.py +44 -6
hafnia/platform/download.py +2 -1
hafnia/platform/experiment.py +51 -56
hafnia/platform/trainer_package.py +57 -0
hafnia/utils.py +64 -13
hafnia/visualizations/image_visualizations.py +3 -3
{hafnia-0.2.4.dist-info → hafnia-0.3.0.dist-info}/METADATA +34 -30
hafnia-0.3.0.dist-info/RECORD +53 -0
cli/recipe_cmds.py +0 -45
hafnia-0.2.4.dist-info/RECORD +0 -49
{hafnia-0.2.4.dist-info → hafnia-0.3.0.dist-info}/WHEEL +0 -0
{hafnia-0.2.4.dist-info → hafnia-0.3.0.dist-info}/entry_points.txt +0 -0
{hafnia-0.2.4.dist-info → hafnia-0.3.0.dist-info}/licenses/LICENSE +0 -0

hafnia/dataset/dataset_upload_helper.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
@@ -7,13 +8,9 @@ from typing import Dict, List, Optional, Tuple, Type, Union
 import boto3
 import polars as pl
-from pydantic import BaseModel, ConfigDict
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, field_validator
-import hafnia.dataset.primitives.bbox
-import hafnia.dataset.primitives.bitmask
-import hafnia.dataset.primitives.classification
-import hafnia.dataset.primitives.polygon
-import hafnia.dataset.primitives.segmentation
 from cli.config import Config
 from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
@@ -23,11 +20,19 @@ from hafnia.dataset.dataset_names import (
     FieldName,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
+from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.operations import table_transformations
+from hafnia.dataset.primitives import (
+    Bbox,
+    Bitmask,
+    Classification,
+    Polygon,
+    Segmentation,
+)
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.http import post
 from hafnia.log import user_logger
-from hafnia.platform import get_dataset_id
+from hafnia.platform.datasets import get_dataset_id
 def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
@@ -53,7 +58,7 @@ class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     annotation_ontology: Optional[str] = None
     dataset_variants: Optional[List[DbDatasetVariant]] = None
     split_annotations_reports: Optional[List[DbSplitAnnotationsReport]] = None
-    dataset_images: Optional[List[DatasetImage]] = None
+    imgs: Optional[List[DatasetImage]] = None
 class DbDatasetVariant(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -75,6 +80,8 @@ class DbAnnotatedObject(BaseModel, validate_assignment=True):  # type: ignore[ca
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
     entity_type: EntityTypeChoices
+    annotation_type: DbAnnotationType
+    task_name: Optional[str] = None  # Not sure if adding task_name makes sense.
 class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -82,10 +89,34 @@ class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ign
     obj: DbAnnotatedObject
     unique_obj_ids: Optional[int] = None
     obj_instances: Optional[int] = None
+    images_with_obj: Optional[int] = None
     average_count_per_image: Optional[float] = None
-    avg_area: Optional[float] = None
-    min_area: Optional[float] = None
-    max_area: Optional[float] = None
+    area_avg_ratio: Optional[float] = None
+    area_min_ratio: Optional[float] = None
+    area_max_ratio: Optional[float] = None
+    height_avg_ratio: Optional[float] = None
+    height_min_ratio: Optional[float] = None
+    height_max_ratio: Optional[float] = None
+    width_avg_ratio: Optional[float] = None
+    width_min_ratio: Optional[float] = None
+    width_max_ratio: Optional[float] = None
+    area_avg_px: Optional[float] = None
+    area_min_px: Optional[int] = None
+    area_max_px: Optional[int] = None
+    height_avg_px: Optional[float] = None
+    height_min_px: Optional[int] = None
+    height_max_px: Optional[int] = None
+    width_avg_px: Optional[float] = None
+    width_min_px: Optional[int] = None
+    width_max_px: Optional[int] = None
     annotation_type: Optional[List[DbAnnotationType]] = None
@@ -155,8 +186,29 @@ class EntityTypeChoices(str, Enum):  # Should match `EntityTypeChoices` in `dipd
     EVENT = "EVENT"
-class DatasetImage(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
-    img: str
+class DatasetImage(Attribution, validate_assignment=True):  # type: ignore[call-arg]
+    img: str  # Base64-encoded image string
+    order: Optional[int] = None
+    @field_validator("img", mode="before")
+    def validate_image_path(cls, v: Union[str, Path]) -> str:
+        if isinstance(v, Path):
+            v = path_image_to_base64_str(path_image=v)
+        if not isinstance(v, str):
+            raise ValueError("Image must be a string or Path object representing the image path.")
+        if not v.startswith("data:image/"):
+            raise ValueError("Image must be a base64-encoded data URL.")
+        return v
+def path_image_to_base64_str(path_image: Path) -> str:
+    image = Image.open(path_image)
+    mime_format = Image.MIME[image.format]
+    as_b64 = base64.b64encode(path_image.read_bytes()).decode("ascii")
+    return f"data:{mime_format};base64,{as_b64}"
 class DbDistributionType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -185,7 +237,10 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
-def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset) -> dict:
+def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
+    if not upload_gallery_images:
+        dataset_update.imgs = None
     cfg = Config()
     dataset_details = dataset_update.model_dump_json()
     data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
@@ -200,8 +255,8 @@ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
     headers = {"Authorization": cfg.api_key}
     user_logger.info("Importing dataset details. This may take up to 30 seconds...")
-    data = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
-    return data  # type: ignore[return-value]
+    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+    return response  # type: ignore[return-value]
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
@@ -219,7 +274,6 @@ def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Ty
     col_name = PrimitiveType.column_name()
     table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
     if col_name not in table.columns:
-        user_logger.warning(f"Warning: No field called '{col_name}' was found for '{PrimitiveType.__name__}'.")
         return False
     if table[col_name].dtype == pl.Null:
@@ -235,7 +289,7 @@ def calculate_distribution_values(
     if len(distribution_tasks) == 0:
         return []
-    classification_column = hafnia.dataset.primitives.classification.Classification.column_name()
+    classification_column = Classification.column_name()
     classifications = dataset_split.select(pl.col(classification_column).explode())
     classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
     classifications = classifications.filter(
@@ -277,6 +331,8 @@ def dataset_info_from_dataset(
     deployment_stage: DeploymentStage,
     path_sample: Optional[Path],
     path_hidden: Optional[Path],
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
 ) -> DbDataset:
     dataset_variants = []
     dataset_reports = []
@@ -292,6 +348,12 @@ def dataset_info_from_dataset(
     if len(path_and_variant) == 0:
         raise ValueError("At least one path must be provided for sample or hidden dataset.")
+    gallery_images = create_gallery_images(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+    )
     for path_dataset, variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
@@ -331,19 +393,26 @@ def dataset_info_from_dataset(
             )
             object_reports: List[DbAnnotatedObjectReport] = []
-            primitive_columns = [tPrimtive.column_name() for tPrimtive in primitives.PRIMITIVE_TYPES]
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bbox.Bbox):
-                bbox_column_name = hafnia.dataset.primitives.bbox.Bbox.column_name()
-                drop_columns = [col for col in primitive_columns if col != bbox_column_name]
-                drop_columns.append(FieldName.META)
-                df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
-                df_per_instance = df_per_instance.explode(bbox_column_name).drop(drop_columns).unnest(bbox_column_name)
+            primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
+            if has_primitive(dataset_split, PrimitiveType=Bbox):
+                df_per_instance = table_transformations.create_primitive_table(
+                    dataset_split, PrimitiveType=Bbox, keep_sample_data=True
+                )
+                if df_per_instance is None:
+                    raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
                 # Calculate area of bounding boxes
-                df_per_instance = df_per_instance.with_columns((pl.col("height") * pl.col("width")).alias("area"))
+                df_per_instance = df_per_instance.with_columns(
+                    (pl.col("height") * pl.col("width")).alias("area"),
+                ).with_columns(
+                    (pl.col("height") * pl.col("image.height")).alias("height_px"),
+                    (pl.col("width") * pl.col("image.width")).alias("width_px"),
+                    (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
+                )
                 annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-                for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                for (class_name, task_name), class_group in df_per_instance.group_by(
+                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                ):
                     if class_name is None:
                         continue
                     object_reports.append(
@@ -351,25 +420,39 @@ def dataset_info_from_dataset(
                             obj=DbAnnotatedObject(
                                 name=class_name,
                                 entity_type=EntityTypeChoices.OBJECT.value,
+                                annotation_type=annotation_type,
+                                task_name=task_name,
                             ),
                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
-                            avg_area=class_group["area"].mean(),
-                            min_area=class_group["area"].min(),
-                            max_area=class_group["area"].max(),
+                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            area_avg_ratio=class_group["area"].mean(),
+                            area_min_ratio=class_group["area"].min(),
+                            area_max_ratio=class_group["area"].max(),
+                            height_avg_ratio=class_group["height"].mean(),
+                            height_min_ratio=class_group["height"].min(),
+                            height_max_ratio=class_group["height"].max(),
+                            width_avg_ratio=class_group["width"].mean(),
+                            width_min_ratio=class_group["width"].min(),
+                            width_max_ratio=class_group["width"].max(),
+                            area_avg_px=class_group["area_px"].mean(),
+                            area_min_px=int(class_group["area_px"].min()),
+                            area_max_px=int(class_group["area_px"].max()),
+                            height_avg_px=class_group["height_px"].mean(),
+                            height_min_px=int(class_group["height_px"].min()),
+                            height_max_px=int(class_group["height_px"].max()),
+                            width_avg_px=class_group["width_px"].mean(),
+                            width_min_px=int(class_group["width_px"].min()),
+                            width_max_px=int(class_group["width_px"].max()),
                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
                         )
                     )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.classification.Classification):
+            if has_primitive(dataset_split, PrimitiveType=Classification):
                 annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-                col_name = hafnia.dataset.primitives.classification.Classification.column_name()
-                classification_tasks = [
-                    task.name
-                    for task in dataset.info.tasks
-                    if task.primitive == hafnia.dataset.primitives.classification.Classification
-                ]
+                col_name = Classification.column_name()
+                classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
                 has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
                 if has_classification_data:
                     classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
@@ -385,7 +468,7 @@ def dataset_info_from_dataset(
                     ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
                         if class_name is None:
                             continue
-                        if task_name == hafnia.dataset.primitives.classification.Classification.default_task_name():
+                        if task_name == Classification.default_task_name():
                             display_name = class_name  # Prefix class name with task name
                         else:
                             display_name = f"{task_name}.{class_name}"
@@ -394,6 +477,8 @@ def dataset_info_from_dataset(
                                 obj=DbAnnotatedObject(
                                     name=display_name,
                                     entity_type=EntityTypeChoices.EVENT.value,
+                                    annotation_type=annotation_type,
+                                    task_name=task_name,
                                 ),
                                 unique_obj_ids=len(
                                     class_group
@@ -403,22 +488,32 @@ def dataset_info_from_dataset(
                             )
                         )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.segmentation.Segmentation):
+            if has_primitive(dataset_split, PrimitiveType=Segmentation):
                 raise NotImplementedError("Not Implemented yet")
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bitmask.Bitmask):
-                col_name = hafnia.dataset.primitives.bitmask.Bitmask.column_name()
+            if has_primitive(dataset_split, PrimitiveType=Bitmask):
+                col_name = Bitmask.column_name()
                 drop_columns = [col for col in primitive_columns if col != col_name]
                 drop_columns.append(FieldName.META)
-                df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
-                df_per_instance = df_per_instance.explode(col_name).drop(drop_columns).unnest(col_name)
-                min_area = df_per_instance["area"].min() if "area" in df_per_instance.columns else None
-                max_area = df_per_instance["area"].max() if "area" in df_per_instance.columns else None
-                avg_area = df_per_instance["area"].mean() if "area" in df_per_instance.columns else None
+                df_per_instance = table_transformations.create_primitive_table(
+                    dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
+                )
+                if df_per_instance is None:
+                    raise ValueError(
+                        f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
+                    )
+                df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
+                df_per_instance = df_per_instance.with_columns(
+                    (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
+                    (pl.col("height_px") / pl.col("image.height")).alias("height"),
+                    (pl.col("width_px") / pl.col("image.width")).alias("width"),
+                )
                 annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-                for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                for (class_name, task_name), class_group in df_per_instance.group_by(
+                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                ):
                     if class_name is None:
                         continue
                     object_reports.append(
@@ -426,18 +521,36 @@ def dataset_info_from_dataset(
                             obj=DbAnnotatedObject(
                                 name=class_name,
                                 entity_type=EntityTypeChoices.OBJECT.value,
+                                annotation_type=annotation_type,
+                                task_name=task_name,
                             ),
                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
-                            avg_area=avg_area,
-                            min_area=min_area,
-                            max_area=max_area,
+                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            area_avg_ratio=class_group["area"].mean(),
+                            area_min_ratio=class_group["area"].min(),
+                            area_max_ratio=class_group["area"].max(),
+                            height_avg_ratio=class_group["height"].mean(),
+                            height_min_ratio=class_group["height"].min(),
+                            height_max_ratio=class_group["height"].max(),
+                            width_avg_ratio=class_group["width"].mean(),
+                            width_min_ratio=class_group["width"].min(),
+                            width_max_ratio=class_group["width"].max(),
+                            area_avg_px=class_group["area_px"].mean(),
+                            area_min_px=int(class_group["area_px"].min()),
+                            area_max_px=int(class_group["area_px"].max()),
+                            height_avg_px=class_group["height_px"].mean(),
+                            height_min_px=int(class_group["height_px"].min()),
+                            height_max_px=int(class_group["height_px"].max()),
+                            width_avg_px=class_group["width_px"].mean(),
+                            width_min_px=int(class_group["width_px"].min()),
+                            width_max_px=int(class_group["width_px"].max()),
                         )
                     )
-            if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.polygon.Polygon):
+            if has_primitive(dataset_split, PrimitiveType=Polygon):
                 raise NotImplementedError("Not Implemented yet")
             # Sort object reports by name to more easily compare between versions
@@ -463,6 +576,46 @@ def dataset_info_from_dataset(
         data_received_end=dataset_meta_info.get("data_received_end", None),
         annotation_project_id=dataset_meta_info.get("annotation_project_id", None),
         annotation_dataset_id=dataset_meta_info.get("annotation_dataset_id", None),
+        imgs=gallery_images,
     )
     return dataset_info
+def create_gallery_images(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path],
+    gallery_image_names: Optional[List[str]],
+) -> Optional[List[DatasetImage]]:
+    gallery_images = None
+    if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
+        if path_gallery_images is None:
+            raise ValueError("Path to gallery images must be provided.")
+        path_gallery_images.mkdir(parents=True, exist_ok=True)
+        COL_IMAGE_NAME = "image_name"
+        samples = dataset.samples.with_columns(
+            dataset.samples[ColumnName.FILE_NAME].str.split("/").list.last().alias(COL_IMAGE_NAME)
+        )
+        gallery_samples = samples.filter(pl.col(COL_IMAGE_NAME).is_in(gallery_image_names))
+        missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
+        if len(missing_gallery_samples):
+            raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
+        gallery_images = []
+        for gallery_sample in gallery_samples.iter_rows(named=True):
+            sample = Sample(**gallery_sample)
+            image = sample.draw_annotations()
+            path_gallery_image = path_gallery_images / gallery_sample[COL_IMAGE_NAME]
+            Image.fromarray(image).save(path_gallery_image)
+            dataset_image_dict = {
+                "img": path_gallery_image,
+            }
+            if sample.attribution is not None:
+                sample.attribution.changes = "Annotations have been visualized"
+                dataset_image_dict.update(sample.attribution.model_dump(exclude_none=True))
+            gallery_img = DatasetImage(**dataset_image_dict)
+            gallery_img.licenses = gallery_img.licenses or []
+            gallery_images.append(gallery_img)
+    return gallery_images

hafnia 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

hafnia 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl