hafnia 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +2 -1
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +60 -4
  11. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  12. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  13. hafnia/dataset/hafnia_dataset.py +176 -50
  14. hafnia/dataset/operations/dataset_stats.py +2 -3
  15. hafnia/dataset/operations/dataset_transformations.py +19 -15
  16. hafnia/dataset/operations/table_transformations.py +4 -3
  17. hafnia/dataset/primitives/bbox.py +25 -12
  18. hafnia/dataset/primitives/bitmask.py +26 -14
  19. hafnia/dataset/primitives/classification.py +16 -8
  20. hafnia/dataset/primitives/point.py +7 -3
  21. hafnia/dataset/primitives/polygon.py +16 -9
  22. hafnia/dataset/primitives/segmentation.py +10 -7
  23. hafnia/experiment/hafnia_logger.py +0 -9
  24. hafnia/platform/dataset_recipe.py +7 -2
  25. hafnia/platform/datasets.py +3 -3
  26. hafnia/platform/download.py +23 -18
  27. hafnia/utils.py +17 -0
  28. hafnia/visualizations/image_visualizations.py +1 -1
  29. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/METADATA +8 -6
  30. hafnia-0.4.0.dist-info/RECORD +56 -0
  31. hafnia-0.3.0.dist-info/RECORD +0 -53
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  33. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  34. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
cli/__main__.py CHANGED
@@ -37,7 +37,9 @@ def configure(cfg: Config) -> None:
 
     platform_url = click.prompt("Hafnia Platform URL", type=str, default=consts.DEFAULT_API_URL)
 
-    cfg_profile = ConfigSchema(api_key=api_key, platform_url=platform_url)
+    use_keychain = click.confirm("Store API key in system keychain?", default=False)
+
+    cfg_profile = ConfigSchema(platform_url=platform_url, api_key=api_key, use_keychain=use_keychain)
     cfg.add_profile(profile_name, cfg_profile, set_active=True)
     cfg.save_config()
     profile_cmds.profile_show(cfg)
cli/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional
 from pydantic import BaseModel, field_validator
 
 import cli.consts as consts
+import cli.keychain as keychain
 from hafnia.log import sys_logger, user_logger
 
 PLATFORM_API_MAPPING = {
@@ -19,9 +20,18 @@ PLATFORM_API_MAPPING = {
 }
 
 
+class SecretStr(str):
+    def __repr__(self):
+        return "********"
+
+    def __str__(self):
+        return "********"
+
+
 class ConfigSchema(BaseModel):
     platform_url: str = ""
     api_key: Optional[str] = None
+    use_keychain: bool = False
 
     @field_validator("api_key")
     def validate_api_key(cls, value: Optional[str]) -> Optional[str]:
@@ -35,7 +45,7 @@ class ConfigSchema(BaseModel):
             sys_logger.warning("API key is missing the 'ApiKey ' prefix. Prefix is being added automatically.")
             value = f"ApiKey {value}"
 
-        return value
+        return SecretStr(value)  # Keeps the API key masked in logs and repr
 
 
 class ConfigFileSchema(BaseModel):
@@ -70,13 +80,32 @@ class Config:
 
     @property
     def api_key(self) -> str:
+        # Check keychain first if enabled
+        if self.config.use_keychain:
+            keychain_key = keychain.get_api_key(self.active_profile)
+            if keychain_key is not None:
+                return keychain_key
+
+        # Fall back to config file
         if self.config.api_key is not None:
             return self.config.api_key
+
         raise ValueError(consts.ERROR_API_KEY_NOT_SET)
 
     @api_key.setter
     def api_key(self, value: str) -> None:
-        self.config.api_key = value
+        # Store in keychain if enabled
+        if self.config.use_keychain:
+            if keychain.store_api_key(self.active_profile, value):
+                # Successfully stored in keychain, don't store in config
+                self.config.api_key = None
+            else:
+                # Keychain storage failed, fall back to config file
+                sys_logger.warning("Failed to store in keychain, falling back to config file")
+                self.config.api_key = value
+        else:
+            # Not using keychain, store in config file
+            self.config.api_key = value
 
     @property
     def platform_url(self) -> str:
@@ -152,8 +181,19 @@ class Config:
         raise ValueError("Failed to parse configuration file")
 
     def save_config(self) -> None:
+        # Create a copy to avoid modifying the original data
+        config_to_save = self.config_data.model_dump()
+
+        # Store API key in keychain if enabled, and don't write to file
+        for profile_name, profile_data in config_to_save["profiles"].items():
+            if profile_data.get("use_keychain", False):
+                api_key = profile_data.get("api_key")
+                if api_key:
+                    keychain.store_api_key(profile_name, api_key)
+                    profile_data["api_key"] = None
+
         with open(self.config_path, "w") as f:
-            json.dump(self.config_data.model_dump(), f, indent=4)
+            json.dump(config_to_save, f, indent=4)
 
     def remove_profile(self, profile_name: str) -> None:
         if profile_name not in self.config_data.profiles:
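A minimal sketch of how the new ConfigSchema fields behave, based only on the hunks above; the URL and key values are placeholders:

    from cli.config import ConfigSchema

    profile = ConfigSchema(platform_url="https://example.invalid", api_key="my-secret", use_keychain=True)

    # The validator adds the missing "ApiKey " prefix and wraps the value in SecretStr,
    # so printing or logging the field only shows a mask.
    print(profile.api_key)                        # ********
    print(profile.api_key == "ApiKey my-secret")  # True - SecretStr subclasses str, so the real value stays usable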
cli/keychain.py ADDED
@@ -0,0 +1,88 @@
+"""Keychain storage for API keys using the system keychain."""
+
+from typing import Optional
+
+from hafnia.log import sys_logger
+
+# Keyring is optional - gracefully degrade if not available
+try:
+    import keyring
+
+    KEYRING_AVAILABLE = True
+except ImportError:
+    KEYRING_AVAILABLE = False
+    sys_logger.debug("keyring library not available, keychain storage disabled")
+
+KEYRING_SERVICE_NAME = "hafnia-cli"
+
+
+def store_api_key(profile_name: str, api_key: str) -> bool:
+    """
+    Store an API key in the system keychain.
+
+    Args:
+        profile_name: The profile name to associate with the key
+        api_key: The API key to store
+
+    Returns:
+        True if successfully stored, False otherwise
+    """
+    if not KEYRING_AVAILABLE:
+        sys_logger.warning("Keyring library not available, cannot store API key in keychain")
+        return False
+
+    try:
+        keyring.set_password(KEYRING_SERVICE_NAME, profile_name, api_key)
+        sys_logger.debug(f"Stored API key for profile '{profile_name}' in keychain")
+        return True
+    except Exception as e:
+        sys_logger.warning(f"Failed to store API key in keychain: {e}")
+        return False
+
+
+def get_api_key(profile_name: str) -> Optional[str]:
+    """
+    Retrieve an API key from the system keychain.
+
+    Args:
+        profile_name: The profile name to retrieve the key for
+
+    Returns:
+        The API key if found, None otherwise
+    """
+    if not KEYRING_AVAILABLE:
+        return None
+
+    try:
+        api_key = keyring.get_password(KEYRING_SERVICE_NAME, profile_name)
+        if api_key:
+            sys_logger.debug(f"Retrieved API key for profile '{profile_name}' from keychain")
+        return api_key
+    except Exception as e:
+        sys_logger.warning(f"Failed to retrieve API key from keychain: {e}")
+        return None
+
+
+def delete_api_key(profile_name: str) -> bool:
+    """
+    Delete an API key from the system keychain.
+
+    Args:
+        profile_name: The profile name to delete the key for
+
+    Returns:
+        True if successfully deleted or didn't exist, False on error
+    """
+    if not KEYRING_AVAILABLE:
+        return False
+
+    try:
+        keyring.delete_password(KEYRING_SERVICE_NAME, profile_name)
+        sys_logger.debug(f"Deleted API key for profile '{profile_name}' from keychain")
+        return True
+    except keyring.errors.PasswordDeleteError:
+        # Key didn't exist, which is fine
+        return True
+    except Exception as e:
+        sys_logger.warning(f"Failed to delete API key from keychain: {e}")
+        return False
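A short usage sketch of the new module, assuming the optional keyring dependency is installed; the profile name and key are placeholders:

    from cli import keychain

    if keychain.KEYRING_AVAILABLE:
        keychain.store_api_key("default", "ApiKey my-secret")  # True on success
        print(keychain.get_api_key("default"))                 # "ApiKey my-secret"
        keychain.delete_api_key("default")                     # True, also True if the key did not exist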
cli/profile_cmds.py CHANGED
@@ -50,10 +50,13 @@ def cmd_profile_use(cfg: Config, profile_name: str) -> None:
 @click.option(
     "--activate/--no-activate", help="Activate the created profile after creation", default=True, show_default=True
 )
+@click.option(
+    "--use-keychain", is_flag=True, help="Store API key in system keychain instead of config file", default=False
+)
 @click.pass_obj
-def cmd_profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate: bool) -> None:
+def cmd_profile_create(cfg: Config, name: str, api_url: str, api_key: str, activate: bool, use_keychain: bool) -> None:
     """Create a new profile."""
-    cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key)
+    cfg_profile = ConfigSchema(platform_url=api_url, api_key=api_key, use_keychain=use_keychain)
 
     cfg.add_profile(profile_name=name, profile=cfg_profile, set_active=activate)
     profile_show(cfg)
hafnia/__init__.py CHANGED
@@ -3,4 +3,4 @@ from importlib.metadata import version
 __package_name__ = "hafnia"
 __version__ = version(__package_name__)
 
-__dataset_format_version__ = "0.0.2"  # Hafnia dataset format version
+__dataset_format_version__ = "0.1.0"  # Hafnia dataset format version
hafnia/dataset/dataset_helpers.py CHANGED
@@ -38,12 +38,19 @@ def hash_from_bytes(data: bytes) -> str:
 
 def save_image_with_hash_name(image: np.ndarray, path_folder: Path) -> Path:
     pil_image = Image.fromarray(image)
+    path_image = save_pil_image_with_hash_name(pil_image, path_folder)
+    return path_image
+
+
+def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_skip: bool = True) -> Path:
     buffer = io.BytesIO()
-    pil_image.save(buffer, format="PNG")
+    image.save(buffer, format="PNG")
     hash_value = hash_from_bytes(buffer.getvalue())
     path_image = Path(path_folder) / relative_path_from_hash(hash=hash_value, suffix=".png")
+    if allow_skip and path_image.exists():
+        return path_image
     path_image.parent.mkdir(parents=True, exist_ok=True)
-    pil_image.save(path_image)
+    image.save(path_image)
     return path_image
 
 
hafnia/dataset/dataset_names.py CHANGED
@@ -49,7 +49,7 @@ class FieldName:
 
 class ColumnName:
     SAMPLE_INDEX: str = "sample_index"
-    FILE_NAME: str = "file_name"
+    FILE_PATH: str = "file_path"
     HEIGHT: str = "height"
     WIDTH: str = "width"
     SPLIT: str = "split"
@@ -57,6 +57,7 @@ class ColumnName:
     ATTRIBUTION: str = "attribution"  # Attribution for the sample (image/video), e.g. creator, license, source, etc.
     TAGS: str = "tags"
    META: str = "meta"
+    DATASET_NAME: str = "dataset_name"
 
 
 class SplitName:
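To illustrate the renamed and added column constants, a small polars sketch; the table contents are made up:

    import polars as pl

    from hafnia.dataset.dataset_names import ColumnName

    # FILE_NAME ("file_name") is replaced by FILE_PATH ("file_path"); DATASET_NAME is new.
    samples = pl.DataFrame({ColumnName.FILE_PATH: ["data/ab/cd.png"], ColumnName.DATASET_NAME: ["demo"]})
    print(samples.select(ColumnName.FILE_PATH, ColumnName.DATASET_NAME))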
hafnia/dataset/dataset_recipe/dataset_recipe.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import json
 import os
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 from pydantic import (
     field_serializer,
@@ -12,7 +12,11 @@ from pydantic import (
 
 from hafnia import utils
 from hafnia.dataset.dataset_recipe import recipe_transforms
-from hafnia.dataset.dataset_recipe.recipe_types import RecipeCreation, RecipeTransform, Serializable
+from hafnia.dataset.dataset_recipe.recipe_types import (
+    RecipeCreation,
+    RecipeTransform,
+    Serializable,
+)
 from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.dataset.primitives.primitive import Primitive
 
@@ -41,6 +45,17 @@ class DatasetRecipe(Serializable):
         creation = FromName(name=name, force_redownload=force_redownload, download_files=download_files)
         return DatasetRecipe(creation=creation)
 
+    @staticmethod
+    def from_name_public_dataset(
+        name: str, force_redownload: bool = False, n_samples: Optional[int] = None
+    ) -> DatasetRecipe:
+        creation = FromNamePublicDataset(
+            name=name,
+            force_redownload=force_redownload,
+            n_samples=n_samples,
+        )
+        return DatasetRecipe(creation=creation)
+
     @staticmethod
     def from_path(path_folder: Path, check_for_images: bool = True) -> DatasetRecipe:
         creation = FromPath(path_folder=path_folder, check_for_images=check_for_images)
@@ -222,7 +237,7 @@ class DatasetRecipe(Serializable):
         """Serialize the dataset recipe to a dictionary."""
         return self.model_dump(mode="json")
 
-    def as_platform_recipe(self, recipe_name: Optional[str]) -> Dict:
+    def as_platform_recipe(self, recipe_name: Optional[str], overwrite: bool = False) -> Dict:
         """Uploads dataset recipe to the hafnia platform."""
         from cli.config import Config
         from hafnia.platform.dataset_recipe import get_or_create_dataset_recipe
@@ -235,6 +250,7 @@ class DatasetRecipe(Serializable):
             endpoint=endpoint_dataset,
             api_key=cfg.api_key,
             name=recipe_name,
+            overwrite=overwrite,
         )
 
         return recipe_dict
@@ -246,10 +262,17 @@ class DatasetRecipe(Serializable):
         return recipe
 
     def select_samples(
-        recipe: DatasetRecipe, n_samples: int, shuffle: bool = True, seed: int = 42, with_replacement: bool = False
+        recipe: DatasetRecipe,
+        n_samples: int,
+        shuffle: bool = True,
+        seed: int = 42,
+        with_replacement: bool = False,
     ) -> DatasetRecipe:
         operation = recipe_transforms.SelectSamples(
-            n_samples=n_samples, shuffle=shuffle, seed=seed, with_replacement=with_replacement
+            n_samples=n_samples,
+            shuffle=shuffle,
+            seed=seed,
+            with_replacement=with_replacement,
         )
         recipe.append_operation(operation)
         return recipe
@@ -273,7 +296,7 @@ class DatasetRecipe(Serializable):
 
     def class_mapper(
         recipe: DatasetRecipe,
-        class_mapping: Dict[str, str],
+        class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
         method: str = "strict",
         primitive: Optional[Type[Primitive]] = None,
         task_name: Optional[str] = None,
@@ -400,6 +423,22 @@ class FromName(RecipeCreation):
         return [self.name]
 
 
+class FromNamePublicDataset(RecipeCreation):
+    name: str
+    force_redownload: bool = False
+    n_samples: Optional[int] = None
+
+    @staticmethod
+    def get_function() -> Callable[..., "HafniaDataset"]:
+        return HafniaDataset.from_name_public_dataset
+
+    def as_short_name(self) -> str:
+        return f"Torchvision('{self.name}')"
+
+    def get_dataset_names(self) -> List[str]:
+        return []
+
+
 class FromMerge(RecipeCreation):
     recipe0: DatasetRecipe
     recipe1: DatasetRecipe
@@ -414,7 +453,10 @@ class FromMerge(RecipeCreation):
 
     def get_dataset_names(self) -> List[str]:
         """Get the dataset names from the merged recipes."""
-        names = [*self.recipe0.creation.get_dataset_names(), *self.recipe1.creation.get_dataset_names()]
+        names = [
+            *self.recipe0.creation.get_dataset_names(),
+            *self.recipe1.creation.get_dataset_names(),
+        ]
         return names
 
 
@@ -439,33 +481,3 @@ class FromMerger(RecipeCreation):
         for recipe in self.recipes:
             names.extend(recipe.creation.get_dataset_names())
         return names
-
-
-def extract_dataset_names_from_json_dict(data: dict) -> list[str]:
-    """
-    Extract dataset names recursively from a JSON dictionary added with 'from_name'.
-
-    Even if the same functionality is achieved with `DatasetRecipe.get_dataset_names()`,
-    we want to keep this function in 'dipdatalib' to extract dataset names from json dictionaries
-    directly.
-    """
-    creation_field = data.get("creation")
-    if creation_field is None:
-        return []
-    if creation_field.get("__type__") == "FromName":
-        return [creation_field["name"]]
-    elif creation_field.get("__type__") == "FromMerge":
-        recipe_names = ["recipe0", "recipe1"]
-        dataset_name = []
-        for recipe_name in recipe_names:
-            recipe = creation_field.get(recipe_name)
-            if recipe is None:
-                continue
-            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
-        return dataset_name
-    elif creation_field.get("__type__") == "FromMerger":
-        dataset_name = []
-        for recipe in creation_field.get("recipes", []):
-            dataset_name.extend(extract_dataset_names_from_json_dict(recipe))
-        return dataset_name
-    return []
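A sketch of the new recipe entry point chained with the reformatted select_samples call, based on the hunks above; "mnist" is an assumed public dataset name, substitute one your platform supports:

    from hafnia.dataset.dataset_recipe.dataset_recipe import DatasetRecipe

    recipe = DatasetRecipe.from_name_public_dataset(name="mnist", n_samples=1_000)
    recipe = recipe.select_samples(n_samples=100, shuffle=True, seed=42)
    print(recipe.as_dict())  # JSON-serializable form, as in the as_dict() hunk above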
hafnia/dataset/dataset_recipe/recipe_transforms.py CHANGED
@@ -1,4 +1,6 @@
-from typing import Callable, Dict, List, Optional, Type, Union
+from typing import Callable, Dict, List, Optional, Tuple, Type, Union
+
+from pydantic import field_validator
 
 from hafnia.dataset.dataset_recipe.recipe_types import RecipeTransform
 from hafnia.dataset.hafnia_dataset import HafniaDataset
@@ -52,11 +54,25 @@ class DefineSampleSetBySize(RecipeTransform):
 
 
 class ClassMapper(RecipeTransform):
-    class_mapping: Dict[str, str]
+    class_mapping: Union[Dict[str, str], List[Tuple[str, str]]]
     method: str = "strict"
     primitive: Optional[Type[Primitive]] = None
     task_name: Optional[str] = None
 
+    @field_validator("class_mapping", mode="after")
+    @classmethod
+    def serialize_class_mapping(cls, value: Union[Dict[str, str], List[Tuple[str, str]]]) -> List[Tuple[str, str]]:
+        # Converts the dictionary class mapping to a list of tuples
+        # e.g. {"old_class": "new_class", } --> [("old_class", "new_class")]
+        # The reason is that storing class mappings as a dictionary does not preserve order of json fields
+        # when stored in a database as a jsonb field (postgres).
+        # Preserving order of class mapping fields is important as it defines the indices of the classes.
+        # So to ensure that class indices are maintained, we preserve order of json fields, by converting the
+        # dictionary to a list of tuples.
+        if isinstance(value, dict):
+            value = list(value.items())
+        return value
+
     @staticmethod
     def get_function() -> Callable[..., "HafniaDataset"]:
         return HafniaDataset.class_mapper
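A minimal sketch of the new validator's effect; the class names are illustrative:

    from hafnia.dataset.dataset_recipe.recipe_transforms import ClassMapper

    mapper = ClassMapper(class_mapping={"person": "human", "rider": "human"})
    print(mapper.class_mapping)  # [('person', 'human'), ('rider', 'human')] - an order-preserving list of tuples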
hafnia/dataset/dataset_upload_helper.py CHANGED
@@ -4,7 +4,7 @@ import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import boto3
 import polars as pl
@@ -52,6 +52,7 @@ class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
+    dataset_format_version: Optional[str] = None
     annotation_date: Optional[datetime] = None
     annotation_project_id: Optional[str] = None
     annotation_dataset_id: Optional[str] = None
@@ -186,9 +187,58 @@ class EntityTypeChoices(str, Enum):  # Should match `EntityTypeChoices` in `dipd
     EVENT = "EVENT"
 
 
+class Annotations(BaseModel):
+    """
+    Used in 'DatasetImageMetadata' for visualizing image annotations
+    in gallery images on the dataset detail page.
+    """
+
+    objects: Optional[List[Bbox]] = None
+    classifications: Optional[List[Classification]] = None
+    polygons: Optional[List[Polygon]] = None
+    bitmasks: Optional[List[Bitmask]] = None
+
+
+class DatasetImageMetadata(BaseModel):
+    """
+    Metadata for gallery images on the dataset detail page on portal.
+    """
+
+    annotations: Optional[Annotations] = None
+    meta: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def from_sample(cls, sample: Sample) -> "DatasetImageMetadata":
+        sample = sample.model_copy(deep=True)
+        sample.file_path = "/".join(Path(sample.file_path).parts[-3:])
+        metadata = {}
+        metadata_field_names = [
+            ColumnName.FILE_PATH,
+            ColumnName.HEIGHT,
+            ColumnName.WIDTH,
+            ColumnName.SPLIT,
+        ]
+        for field_name in metadata_field_names:
+            if hasattr(sample, field_name) and getattr(sample, field_name) is not None:
+                metadata[field_name] = getattr(sample, field_name)
+
+        obj = DatasetImageMetadata(
+            annotations=Annotations(
+                objects=sample.objects,
+                classifications=sample.classifications,
+                polygons=sample.polygons,
+                bitmasks=sample.bitmasks,
+            ),
+            meta=metadata,
+        )
+
+        return obj
+
+
 class DatasetImage(Attribution, validate_assignment=True):  # type: ignore[call-arg]
     img: str  # Base64-encoded image string
     order: Optional[int] = None
+    metadata: Optional[DatasetImageMetadata] = None
 
     @field_validator("img", mode="before")
     def validate_image_path(cls, v: Union[str, Path]) -> str:
@@ -254,7 +304,7 @@ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
     import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
     headers = {"Authorization": cfg.api_key}
 
-    user_logger.info("Importing dataset details. This may take up to 30 seconds...")
+    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
     response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
     return response  # type: ignore[return-value]
 
@@ -569,7 +619,9 @@ def dataset_info_from_dataset(
         s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-        license_citation=dataset_meta_info.get("license_citation", None),
+        latest_update=dataset.info.updated_at,
+        dataset_format_version=dataset.info.format_version,
+        license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
         data_captured_end=dataset_meta_info.get("data_captured_end", None),
         data_received_start=dataset_meta_info.get("data_received_start", None),
@@ -594,7 +646,7 @@ def create_gallery_images(
     path_gallery_images.mkdir(parents=True, exist_ok=True)
     COL_IMAGE_NAME = "image_name"
     samples = dataset.samples.with_columns(
-        dataset.samples[ColumnName.FILE_NAME].str.split("/").list.last().alias(COL_IMAGE_NAME)
+        dataset.samples[ColumnName.FILE_PATH].str.split("/").list.last().alias(COL_IMAGE_NAME)
     )
     gallery_samples = samples.filter(pl.col(COL_IMAGE_NAME).is_in(gallery_image_names))
 
@@ -604,6 +656,9 @@ def create_gallery_images(
     gallery_images = []
     for gallery_sample in gallery_samples.iter_rows(named=True):
         sample = Sample(**gallery_sample)
+
+        metadata = DatasetImageMetadata.from_sample(sample=sample)
+        sample.classifications = None  # To not draw classifications in gallery images
         image = sample.draw_annotations()
 
         path_gallery_image = path_gallery_images / gallery_sample[COL_IMAGE_NAME]
@@ -611,6 +666,7 @@
 
         dataset_image_dict = {
             "img": path_gallery_image,
+            "metadata": metadata,
         }
         if sample.attribution is not None:
            sample.attribution.changes = "Annotations have been visualized"
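A small sketch of the new gallery metadata models constructed directly; the field values are placeholders, and in create_gallery_images the metadata is instead built from a Sample via DatasetImageMetadata.from_sample:

    from hafnia.dataset.dataset_upload_helper import Annotations, DatasetImageMetadata
    from hafnia.dataset.primitives import Classification

    metadata = DatasetImageMetadata(
        annotations=Annotations(classifications=[Classification(class_name="cat", class_idx=0)]),
        meta={"file_path": "images/ab/cd.png", "height": 480, "width": 640, "split": "train"},
    )
    print(metadata.model_dump_json())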
hafnia/dataset/format_conversions/image_classification_from_directory.py ADDED
@@ -0,0 +1,106 @@
+import shutil
+from pathlib import Path
+from typing import List, Optional
+
+import more_itertools
+import polars as pl
+from PIL import Image
+from rich.progress import track
+
+from hafnia.dataset.dataset_names import ColumnName, FieldName
+from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.primitives import Classification
+from hafnia.utils import is_image_file
+
+
+def import_image_classification_directory_tree(
+    path_folder: Path,
+    split: str,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    class_folder_paths = [path for path in path_folder.iterdir() if path.is_dir()]
+    class_names = sorted([folder.name for folder in class_folder_paths])  # Sort for determinism
+
+    # Gather all image paths per class
+    path_images_per_class: List[List[Path]] = []
+    for path_class_folder in class_folder_paths:
+        per_class_images = []
+        for path_image in list(path_class_folder.rglob("*.*")):
+            if is_image_file(path_image):
+                per_class_images.append(path_image)
+        path_images_per_class.append(sorted(per_class_images))
+
+    # Interleave to ensure classes are balanced in the output dataset for n_samples < total
+    path_images = list(more_itertools.interleave_longest(*path_images_per_class))
+
+    if n_samples is not None:
+        path_images = path_images[:n_samples]
+
+    samples = []
+    for path_image_org in track(path_images, description="Convert 'image classification' dataset to Hafnia Dataset"):
+        class_name = path_image_org.parent.name
+
+        read_image = Image.open(path_image_org)
+        width, height = read_image.size
+
+        classifications = [Classification(class_name=class_name, class_idx=class_names.index(class_name))]
+        sample = Sample(
+            file_path=str(path_image_org.absolute()),
+            width=width,
+            height=height,
+            split=split,
+            classifications=classifications,
+        )
+        samples.append(sample)
+
+    dataset_info = DatasetInfo(
+        dataset_name="ImageClassificationFromDirectoryTree",
+        tasks=[TaskInfo(primitive=Classification, class_names=class_names)],
+    )
+
+    hafnia_dataset = HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+    return hafnia_dataset
+
+
+def export_image_classification_directory_tree(
+    dataset: HafniaDataset,
+    path_output: Path,
+    task_name: Optional[str] = None,
+    clean_folder: bool = False,
+) -> Path:
+    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
+
+    samples = dataset.samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.filter(pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        .alias(task.primitive.column_name())
+    )
+
+    classification_counts = samples[task.primitive.column_name()].list.len()
+    has_no_classification_samples = (classification_counts == 0).sum()
+    if has_no_classification_samples > 0:
+        raise ValueError(f"Some samples do not have a classification for task '{task.name}'.")
+
+    has_multi_classification_samples = (classification_counts > 1).sum()
+    if has_multi_classification_samples > 0:
+        raise ValueError(f"Some samples have multiple classifications for task '{task.name}'.")
+
+    if clean_folder:
+        shutil.rmtree(path_output, ignore_errors=True)
+    path_output.mkdir(parents=True, exist_ok=True)
+
+    description = "Export Hafnia Dataset to directory tree"
+    for sample_dict in track(samples.iter_rows(named=True), total=len(samples), description=description):
+        classifications = sample_dict[task.primitive.column_name()]
+        if len(classifications) != 1:
+            raise ValueError("Each sample should have exactly one classification.")
+        classification = classifications[0]
+        class_name = classification[FieldName.CLASS_NAME].replace("/", "_")  # Avoid issues with subfolders
+        path_class_folder = path_output / class_name
+        path_class_folder.mkdir(parents=True, exist_ok=True)
+
+        path_image_org = Path(sample_dict[ColumnName.FILE_PATH])
+        path_image_new = path_class_folder / path_image_org.name
+        shutil.copy2(path_image_org, path_image_new)
+
+    return path_output
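A round-trip usage sketch of the new conversion module; the directory paths are placeholders and assume a layout of split/class_name/image files (e.g. data/train/cat/001.jpg):

    from pathlib import Path

    from hafnia.dataset.format_conversions.image_classification_from_directory import (
        export_image_classification_directory_tree,
        import_image_classification_directory_tree,
    )

    dataset = import_image_classification_directory_tree(Path("data/train"), split="train", n_samples=500)
    export_image_classification_directory_tree(dataset, path_output=Path("exported/train"), clean_folder=True)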