PyPI - hafnia - Versions diffs - 0.4.0__tar.gz → 0.4.1__tar.gz - Mend

hafnia 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

{hafnia-0.4.0 → hafnia-0.4.1}/.github/workflows/build.yaml RENAMED Viewed

@@ -29,7 +29,7 @@ jobs:
           echo "package_version=$VERSION" >> $GITHUB_OUTPUT
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           version: 0.6.8
@@ -45,7 +45,7 @@ jobs:
         run: uv build
       - name: Upload package artifact
-        uses: actions/upload-artifact@v4.6.2
+        uses: actions/upload-artifact@v5.0.0
         with:
           name: python-package
           path: dist/

{hafnia-0.4.0 → hafnia-0.4.1}/.github/workflows/check_release.yaml RENAMED Viewed

@@ -20,7 +20,7 @@ jobs:
       make_release: ${{ steps.check_release.outputs.make_release }}
     steps:
       - name: Download package artifact
-        uses: actions/download-artifact@v5.0.0
+        uses: actions/download-artifact@v6.0.0
         with:
             name: python-package
             path: dist/

{hafnia-0.4.0 → hafnia-0.4.1}/.github/workflows/publish_docker.yaml RENAMED Viewed

@@ -31,7 +31,7 @@ jobs:
           python-version-file: ${{ inputs.python-version-file }}
       - name: Download package artifact
-        uses: actions/download-artifact@v5.0.0
+        uses: actions/download-artifact@v6.0.0
         with:
           name: python-package
           path: dist/

{hafnia-0.4.0 → hafnia-0.4.1}/.github/workflows/publish_pypi.yaml RENAMED Viewed

@@ -17,7 +17,7 @@ jobs:
       contents: read
     steps:
       - name: Download package artifact
-        uses: actions/download-artifact@v5.0.0
+        uses: actions/download-artifact@v6.0.0
         with:
           name: python-package
           path: dist/

{hafnia-0.4.0 → hafnia-0.4.1}/.github/workflows/tests.yaml RENAMED Viewed

@@ -20,7 +20,7 @@ jobs:
         with:
           python-version-file: ${{ inputs.python-version-file }}
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           version: 0.6.8
       - name: Install the project

{hafnia-0.4.0 → hafnia-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hafnia
-Version: 0.4.0
+Version: 0.4.1
 Summary: Python SDK for communication with Hafnia platform.
 Author-email: Milestone Systems <hafniaplatform@milestone.dk>
 License-File: LICENSE
@@ -158,7 +158,7 @@ and `dataset.samples` with annotations as a polars DataFrame
 print(dataset.samples.head(2))
 shape: (2, 14)
 ┌──────────────┬─────────────────────────────────┬────────┬───────┬───┬─────────────────────────────────┬──────────┬──────────┬─────────────────────────────────┐
-│ sample_index ┆ file_name                       ┆ height ┆ width ┆ … ┆ objects                         ┆ bitmasks ┆ polygons ┆ meta                            │
+│ sample_index ┆ file_name                       ┆ height ┆ width ┆ … ┆ bboxes                          ┆ bitmasks ┆ polygons ┆ meta                            │
 │ ---          ┆ ---                             ┆ ---    ┆ ---   ┆   ┆ ---                             ┆ ---      ┆ ---      ┆ ---                             │
 │ u32          ┆ str                             ┆ i64    ┆ i64   ┆   ┆ list[struct[11]]                ┆ null     ┆ null     ┆ struct[5]                       │
 ╞══════════════╪═════════════════════════════════╪════════╪═══════╪═══╪═════════════════════════════════╪══════════╪══════════╪═════════════════════════════════╡
@@ -218,7 +218,7 @@ sample_dict = dataset[0]
 for sample_dict in dataset:
     sample = Sample(**sample_dict)
-    print(sample.sample_id, sample.objects)
+    print(sample.sample_id, sample.bboxes)
     break
 ```
 Note that it is possible to create a `Sample` object from the sample dictionary.
@@ -421,7 +421,7 @@ pil_image.save("visualized_labels.png")
 # Create DataLoaders - using TorchVisionCollateFn
 collate_fn = torch_helpers.TorchVisionCollateFn(
-    skip_stacking=["objects.bbox", "objects.class_idx"]
+    skip_stacking=["bboxes.bbox", "bboxes.class_idx"]
 )
 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
 ```

{hafnia-0.4.0 → hafnia-0.4.1}/README.md RENAMED Viewed

@@ -129,7 +129,7 @@ and `dataset.samples` with annotations as a polars DataFrame
 print(dataset.samples.head(2))
 shape: (2, 14)
 ┌──────────────┬─────────────────────────────────┬────────┬───────┬───┬─────────────────────────────────┬──────────┬──────────┬─────────────────────────────────┐
-│ sample_index ┆ file_name                       ┆ height ┆ width ┆ … ┆ objects                         ┆ bitmasks ┆ polygons ┆ meta                            │
+│ sample_index ┆ file_name                       ┆ height ┆ width ┆ … ┆ bboxes                          ┆ bitmasks ┆ polygons ┆ meta                            │
 │ ---          ┆ ---                             ┆ ---    ┆ ---   ┆   ┆ ---                             ┆ ---      ┆ ---      ┆ ---                             │
 │ u32          ┆ str                             ┆ i64    ┆ i64   ┆   ┆ list[struct[11]]                ┆ null     ┆ null     ┆ struct[5]                       │
 ╞══════════════╪═════════════════════════════════╪════════╪═══════╪═══╪═════════════════════════════════╪══════════╪══════════╪═════════════════════════════════╡
@@ -189,7 +189,7 @@ sample_dict = dataset[0]
 for sample_dict in dataset:
     sample = Sample(**sample_dict)
-    print(sample.sample_id, sample.objects)
+    print(sample.sample_id, sample.bboxes)
     break
 ```
 Note that it is possible to create a `Sample` object from the sample dictionary.
@@ -392,7 +392,7 @@ pil_image.save("visualized_labels.png")
 # Create DataLoaders - using TorchVisionCollateFn
 collate_fn = torch_helpers.TorchVisionCollateFn(
-    skip_stacking=["objects.bbox", "objects.class_idx"]
+    skip_stacking=["bboxes.bbox", "bboxes.class_idx"]
 )
 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
 ```

{hafnia-0.4.0 → hafnia-0.4.1}/examples/example_dataset_recipe.py RENAMED Viewed

@@ -129,26 +129,28 @@ mapping_midwest = {
     "Vehicle*": "Vehicle",  # Wildcard mapping. Selects class names starting with 'Vehicle.' e.g. 'Vehicle.Bicycle', "Vehicle.Car', etc.
     "Vehicle.Trailer": OPS_REMOVE_CLASS,  # Use this to remove a class
 }
-coco_remapped = coco.class_mapper(class_mapping=mappings_coco, method="remove_undefined", task_name="bboxes")
-midwest_remapped = midwest.class_mapper(class_mapping=mapping_midwest, task_name="bboxes")
+coco_remapped = coco.class_mapper(class_mapping=mappings_coco, method="remove_undefined", task_name="object_detection")
+midwest_remapped = midwest.class_mapper(class_mapping=mapping_midwest, task_name="object_detection")
 # 2b) Merge datasets
 merged_dataset_all_images = HafniaDataset.from_merge(dataset0=coco_remapped, dataset1=midwest_remapped)
 # 2c) Remove images without 'Person' or 'Vehicle' annotations
-merged_dataset = merged_dataset_all_images.select_samples_by_class_name(name=["Person", "Vehicle"], task_name="bboxes")
+merged_dataset = merged_dataset_all_images.select_samples_by_class_name(
+    name=["Person", "Vehicle"], task_name="object_detection"
+)
 merged_dataset.print_stats()
 # 3) Once you have verified operations using the 'HafniaDataset' interface, you can convert
 # the operations to a single 'DatasetRecipe'
 merged_recipe = DatasetRecipe.from_merge(
     recipe0=DatasetRecipe.from_name("coco-2017").class_mapper(
-        class_mapping=mappings_coco, method="remove_undefined", task_name="bboxes"
+        class_mapping=mappings_coco, method="remove_undefined", task_name="object_detection"
     ),
     recipe1=DatasetRecipe.from_name("midwest-vehicle-detection").class_mapper(
-        class_mapping=mapping_midwest, task_name="bboxes"
+        class_mapping=mapping_midwest, task_name="object_detection"
     ),
-).select_samples_by_class_name(name=["Person", "Vehicle"], task_name="bboxes")
+).select_samples_by_class_name(name=["Person", "Vehicle"], task_name="object_detection")
 # 3a) Verify again on the sample datasets, that the recipe works and can build as a dataset
 merged_dataset = merged_recipe.build()

{hafnia-0.4.0 → hafnia-0.4.1}/examples/example_hafnia_dataset.py RENAMED Viewed

@@ -33,8 +33,8 @@ dataset.print_class_distribution()
 dataset.print_stats()  # Print verbose dataset statistics
 # Get dataset stats
-dataset.class_counts_all()  # Get class counts for all tasks
-dataset.class_counts_for_task(primitive=Classification)  # Get class counts for a specific task
+dataset.calculate_class_counts()  # Get class counts for all tasks
+dataset.calculate_task_class_counts(primitive=Classification)  # Get class counts for a specific task
 # Create a dataset split for training
 dataset_train = dataset.create_split_dataset("train")
@@ -86,9 +86,19 @@ dataset.write(path_dataset)
 # Load dataset from disk
 dataset_again = HafniaDataset.from_path(path_dataset)
+## Dataset importers and exporters ##
+dataset_coco = HafniaDataset.from_name("coco-2017").select_samples(n_samples=5, seed=42)
+path_yolo_format = Path(".data/tmp/yolo_dataset")
+# Export dataset to YOLO format
+dataset_coco.to_yolo_format(path_export_yolo_dataset=path_yolo_format)
+# Import dataset from YOLO format
+dataset_coco_imported = HafniaDataset.from_yolo_format(path_yolo_format)
+## Custom dataset operations and statistics ##
 # Want custom dataset transformations or statistics? Use the polars table (dataset.samples) directly
-n_objects = dataset.samples["objects"].list.len().sum()
+n_objects = dataset.samples["bboxes"].list.len().sum()
 n_objects = dataset.samples[Bbox.column_name()].list.len().sum()  # Use Bbox.column_name() to avoid magic variables
 n_classifications = dataset.samples[Classification.column_name()].list.len().sum()
@@ -106,7 +116,7 @@ for sample_dict in dataset_train:
 # Unpack dict into a Sample-object! Important for data validation, useability, IDE completion and mypy hints
 sample: Sample = Sample(**sample_dict)
-objects: List[Bbox] = sample.objects  # Use 'sample.objects' access bounding boxes as a list of Bbox objects
+bboxes: List[Bbox] = sample.bboxes  # Use 'sample.bboxes' access bounding boxes as a list of Bbox objects
 bitmasks: List[Bitmask] = sample.bitmasks  # Use 'sample.bitmasks' to access bitmasks as a list of Bitmask objects
 polygons: List[Polygon] = sample.polygons  # Use 'sample.polygons' to access polygons as a list of Polygon objects
 classifications: List[Classification] = sample.classifications  # As a list of Classification objects
@@ -134,7 +144,7 @@ for i_fake_sample in range(5):
         width=640,
         split="train",
         tags=["sample"],
-        objects=bboxes,
+        bboxes=bboxes,
         classifications=classifications,
     )
     fake_samples.append(sample)

{hafnia-0.4.0 → hafnia-0.4.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "hafnia"
-version = "0.4.0"
+version = "0.4.1"
 description = "Python SDK for communication with Hafnia platform."
 readme = "README.md"
 authors = [

{hafnia-0.4.0 → hafnia-0.4.1}/src/hafnia/__init__.py RENAMED Viewed

@@ -3,4 +3,4 @@ from importlib.metadata import version
 __package_name__ = "hafnia"
 __version__ = version(__package_name__)
-__dataset_format_version__ = "0.1.0"  # Hafnia dataset format version
+__dataset_format_version__ = "0.2.0"  # Hafnia dataset format version

hafnia-0.4.1/src/hafnia/dataset/dataset_names.py ADDED Viewed

@@ -0,0 +1,190 @@
+from enum import Enum
+from typing import Dict, List, Optional
+import boto3
+from pydantic import BaseModel, field_validator
+FILENAME_RECIPE_JSON = "recipe.json"
+FILENAME_DATASET_INFO = "dataset_info.json"
+FILENAME_ANNOTATIONS_JSONL = "annotations.jsonl"
+FILENAME_ANNOTATIONS_PARQUET = "annotations.parquet"
+DATASET_FILENAMES_REQUIRED = [
+    FILENAME_DATASET_INFO,
+    FILENAME_ANNOTATIONS_JSONL,
+    FILENAME_ANNOTATIONS_PARQUET,
+]
+class DeploymentStage(Enum):
+    STAGING = "staging"
+    PRODUCTION = "production"
+TAG_IS_SAMPLE = "sample"
+OPS_REMOVE_CLASS = "__REMOVE__"
+class PrimitiveField:
+    CLASS_NAME: str = "class_name"  # Name of the class this primitive is associated with, e.g. "car" for Bbox
+    CLASS_IDX: str = "class_idx"  # Index of the class this primitive is associated with, e.g. 0 for "car" if it is the first class  # noqa: E501
+    OBJECT_ID: str = "object_id"  # Unique identifier for the object, e.g. "12345123"
+    CONFIDENCE: str = "confidence"  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
+    META: str = "meta"  # Contains metadata about each primitive, e.g. attributes color, occluded, iscrowd, etc.
+    TASK_NAME: str = "task_name"  # Name of the task this primitive is associated with, e.g. "bboxes" for Bbox
+    @staticmethod
+    def fields() -> List[str]:
+        """
+        Returns a list of expected field names for primitives.
+        """
+        return [
+            PrimitiveField.CLASS_NAME,
+            PrimitiveField.CLASS_IDX,
+            PrimitiveField.OBJECT_ID,
+            PrimitiveField.CONFIDENCE,
+            PrimitiveField.META,
+            PrimitiveField.TASK_NAME,
+        ]
+class SampleField:
+    FILE_PATH: str = "file_path"
+    HEIGHT: str = "height"
+    WIDTH: str = "width"
+    SPLIT: str = "split"
+    TAGS: str = "tags"
+    CLASSIFICATIONS: str = "classifications"
+    BBOXES: str = "bboxes"
+    BITMASKS: str = "bitmasks"
+    POLYGONS: str = "polygons"
+    STORAGE_FORMAT: str = "storage_format"  # E.g. "image", "video", "zip"
+    COLLECTION_INDEX: str = "collection_index"
+    COLLECTION_ID: str = "collection_id"
+    REMOTE_PATH: str = "remote_path"  # Path to the file in remote storage, e.g. S3
+    SAMPLE_INDEX: str = "sample_index"
+    ATTRIBUTION: str = "attribution"  # Attribution for the sample (image/video), e.g. creator, license, source, etc.
+    META: str = "meta"
+    DATASET_NAME: str = "dataset_name"
+class StorageFormat:
+    IMAGE: str = "image"
+    VIDEO: str = "video"
+    ZIP: str = "zip"
+class SplitName:
+    TRAIN: str = "train"
+    VAL: str = "validation"
+    TEST: str = "test"
+    UNDEFINED: str = "UNDEFINED"
+    @staticmethod
+    def valid_splits() -> List[str]:
+        return [SplitName.TRAIN, SplitName.VAL, SplitName.TEST]
+    @staticmethod
+    def all_split_names() -> List[str]:
+        return [*SplitName.valid_splits(), SplitName.UNDEFINED]
+class DatasetVariant(Enum):
+    DUMP = "dump"
+    SAMPLE = "sample"
+    HIDDEN = "hidden"
+class AwsCredentials(BaseModel):
+    access_key: str
+    secret_key: str
+    session_token: str
+    region: Optional[str]
+    def aws_credentials(self) -> Dict[str, str]:
+        """
+        Returns the AWS credentials as a dictionary.
+        """
+        environment_vars = {
+            "AWS_ACCESS_KEY_ID": self.access_key,
+            "AWS_SECRET_ACCESS_KEY": self.secret_key,
+            "AWS_SESSION_TOKEN": self.session_token,
+        }
+        if self.region:
+            environment_vars["AWS_REGION"] = self.region
+        return environment_vars
+    @staticmethod
+    def from_session(session: boto3.Session) -> "AwsCredentials":
+        """
+        Creates AwsCredentials from a Boto3 session.
+        """
+        frozen_credentials = session.get_credentials().get_frozen_credentials()
+        return AwsCredentials(
+            access_key=frozen_credentials.access_key,
+            secret_key=frozen_credentials.secret_key,
+            session_token=frozen_credentials.token,
+            region=session.region_name,
+        )
+ARN_PREFIX = "arn:aws:s3:::"
+class ResourceCredentials(AwsCredentials):
+    s3_arn: str
+    @staticmethod
+    def fix_naming(payload: Dict[str, str]) -> "ResourceCredentials":
+        """
+        The endpoint returns a payload with a key called 's3_path', but it
+        is actually an ARN path (starts with arn:aws:s3::). This method renames it to 's3_arn' for consistency.
+        """
+        if "s3_path" in payload and payload["s3_path"].startswith(ARN_PREFIX):
+            payload["s3_arn"] = payload.pop("s3_path")
+        if "region" not in payload:
+            payload["region"] = "eu-west-1"
+        return ResourceCredentials(**payload)
+    @field_validator("s3_arn")
+    @classmethod
+    def validate_s3_arn(cls, value: str) -> str:
+        """Validate s3_arn to ensure it starts with 'arn:aws:s3:::'"""
+        if not value.startswith("arn:aws:s3:::"):
+            raise ValueError(f"Invalid S3 ARN: {value}. It should start with 'arn:aws:s3:::'")
+        return value
+    def s3_path(self) -> str:
+        """
+        Extracts the S3 path from the ARN.
+        Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket/my-prefix
+        """
+        return self.s3_arn[len(ARN_PREFIX) :]
+    def s3_uri(self) -> str:
+        """
+        Converts the S3 ARN to a URI format.
+        Example: arn:aws:s3:::my-bucket/my-prefix -> s3://my-bucket/my-prefix
+        """
+        return f"s3://{self.s3_path()}"
+    def bucket_name(self) -> str:
+        """
+        Extracts the bucket name from the S3 ARN.
+        Example: arn:aws:s3:::my-bucket/my-prefix -> my-bucket
+        """
+        return self.s3_path().split("/")[0]
+    def object_key(self) -> str:
+        """
+        Extracts the object key from the S3 ARN.
+        Example: arn:aws:s3:::my-bucket/my-prefix -> my-prefix
+        """
+        return "/".join(self.s3_path().split("/")[1:])

{hafnia-0.4.0 → hafnia-0.4.1}/src/hafnia/dataset/dataset_upload_helper.py RENAMED Viewed

@@ -14,10 +14,10 @@ from pydantic import BaseModel, ConfigDict, field_validator
 from cli.config import Config
 from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
-    ColumnName,
     DatasetVariant,
     DeploymentStage,
-    FieldName,
+    PrimitiveField,
+    SampleField,
     SplitName,
 )
 from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
@@ -193,7 +193,7 @@ class Annotations(BaseModel):
     in gallery images on the dataset detail page.
     """
-    objects: Optional[List[Bbox]] = None
+    bboxes: Optional[List[Bbox]] = None
     classifications: Optional[List[Classification]] = None
     polygons: Optional[List[Polygon]] = None
     bitmasks: Optional[List[Bitmask]] = None
@@ -210,13 +210,15 @@ class DatasetImageMetadata(BaseModel):
     @classmethod
     def from_sample(cls, sample: Sample) -> "DatasetImageMetadata":
         sample = sample.model_copy(deep=True)
+        if sample.file_path is None:
+            raise ValueError("Sample has no file_path defined.")
         sample.file_path = "/".join(Path(sample.file_path).parts[-3:])
         metadata = {}
         metadata_field_names = [
-            ColumnName.FILE_PATH,
-            ColumnName.HEIGHT,
-            ColumnName.WIDTH,
-            ColumnName.SPLIT,
+            SampleField.FILE_PATH,
+            SampleField.HEIGHT,
+            SampleField.WIDTH,
+            SampleField.SPLIT,
         ]
         for field_name in metadata_field_names:
             if hasattr(sample, field_name) and getattr(sample, field_name) is not None:
@@ -224,7 +226,7 @@ class DatasetImageMetadata(BaseModel):
         obj = DatasetImageMetadata(
             annotations=Annotations(
-                objects=sample.objects,
+                bboxes=sample.bboxes,
                 classifications=sample.classifications,
                 polygons=sample.polygons,
                 bitmasks=sample.bitmasks,
@@ -343,13 +345,13 @@ def calculate_distribution_values(
     classifications = dataset_split.select(pl.col(classification_column).explode())
     classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
     classifications = classifications.filter(
-        pl.col(FieldName.TASK_NAME).is_in([task.name for task in distribution_tasks])
+        pl.col(PrimitiveField.TASK_NAME).is_in([task.name for task in distribution_tasks])
     )
     dist_values = []
-    for (task_name,), task_group in classifications.group_by(FieldName.TASK_NAME):
+    for (task_name,), task_group in classifications.group_by(PrimitiveField.TASK_NAME):
         distribution_type = DbDistributionType(name=task_name)
         n_annotated_total = len(task_group)
-        for (class_name,), class_group in task_group.group_by(FieldName.CLASS_NAME):
+        for (class_name,), class_group in task_group.group_by(PrimitiveField.CLASS_NAME):
             class_count = len(class_group)
             dist_values.append(
@@ -383,6 +385,7 @@ def dataset_info_from_dataset(
     path_hidden: Optional[Path],
     path_gallery_images: Optional[Path] = None,
     gallery_image_names: Optional[List[str]] = None,
+    distribution_task_names: Optional[List[TaskInfo]] = None,
 ) -> DbDataset:
     dataset_variants = []
     dataset_reports = []
@@ -427,13 +430,15 @@ def dataset_info_from_dataset(
             )
         )
+        distribution_task_names = distribution_task_names or []
+        distribution_tasks = [t for t in dataset.info.tasks if t.name in distribution_task_names]
         for split_name in SplitChoices:
             split_names = SPLIT_CHOICE_MAPPING[split_name]
-            dataset_split = dataset_variant.samples.filter(pl.col(ColumnName.SPLIT).is_in(split_names))
+            dataset_split = dataset_variant.samples.filter(pl.col(SampleField.SPLIT).is_in(split_names))
             distribution_values = calculate_distribution_values(
                 dataset_split=dataset_split,
-                distribution_tasks=dataset.info.distributions,
+                distribution_tasks=distribution_tasks,
             )
             report = DbSplitAnnotationsReport(
                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
@@ -461,7 +466,7 @@ def dataset_info_from_dataset(
                 annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
                 for (class_name, task_name), class_group in df_per_instance.group_by(
-                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                    PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
                 ):
                     if class_name is None:
                         continue
@@ -473,10 +478,10 @@ def dataset_info_from_dataset(
                                 annotation_type=annotation_type,
                                 task_name=task_name,
                             ),
-                            unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
+                            unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
-                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
                             area_avg_ratio=class_group["area"].mean(),
                             area_min_ratio=class_group["area"].min(),
                             area_max_ratio=class_group["area"].max(),
@@ -495,7 +500,7 @@ def dataset_info_from_dataset(
                             width_avg_px=class_group["width_px"].mean(),
                             width_min_px=int(class_group["width_px"].min()),
                             width_max_px=int(class_group["width_px"].max()),
-                            average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
                         )
                     )
@@ -509,13 +514,13 @@ def dataset_info_from_dataset(
                     # Include only classification tasks that are defined in the dataset info
                     classification_df = classification_df.filter(
-                        pl.col(FieldName.TASK_NAME).is_in(classification_tasks)
+                        pl.col(PrimitiveField.TASK_NAME).is_in(classification_tasks)
                     )
                     for (
                         task_name,
                         class_name,
-                    ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
+                    ), class_group in classification_df.group_by(PrimitiveField.TASK_NAME, PrimitiveField.CLASS_NAME):
                         if class_name is None:
                             continue
                         if task_name == Classification.default_task_name():
@@ -544,7 +549,7 @@ def dataset_info_from_dataset(
             if has_primitive(dataset_split, PrimitiveType=Bitmask):
                 col_name = Bitmask.column_name()
                 drop_columns = [col for col in primitive_columns if col != col_name]
-                drop_columns.append(FieldName.META)
+                drop_columns.append(PrimitiveField.META)
                 df_per_instance = table_transformations.create_primitive_table(
                     dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
@@ -562,7 +567,7 @@ def dataset_info_from_dataset(
                 annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
                 for (class_name, task_name), class_group in df_per_instance.group_by(
-                    FieldName.CLASS_NAME, FieldName.TASK_NAME
+                    PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
                 ):
                     if class_name is None:
                         continue
@@ -574,11 +579,11 @@ def dataset_info_from_dataset(
                                 annotation_type=annotation_type,
                                 task_name=task_name,
                             ),
-                            unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
+                            unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
                             obj_instances=len(class_group),
                             annotation_type=[annotation_type],
-                            average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
-                            images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                            average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
+                            images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
                             area_avg_ratio=class_group["area"].mean(),
                             area_min_ratio=class_group["area"].min(),
                             area_max_ratio=class_group["area"].max(),
@@ -646,7 +651,7 @@ def create_gallery_images(
         path_gallery_images.mkdir(parents=True, exist_ok=True)
         COL_IMAGE_NAME = "image_name"
         samples = dataset.samples.with_columns(
-            dataset.samples[ColumnName.FILE_PATH].str.split("/").list.last().alias(COL_IMAGE_NAME)
+            dataset.samples[SampleField.FILE_PATH].str.split("/").list.last().alias(COL_IMAGE_NAME)
         )
         gallery_samples = samples.filter(pl.col(COL_IMAGE_NAME).is_in(gallery_image_names))

hafnia-0.4.0/src/hafnia/dataset/format_conversions/image_classification_from_directory.py → hafnia-0.4.1/src/hafnia/dataset/format_conversions/format_image_classification_folder.py RENAMED Viewed

@@ -1,23 +1,27 @@
 import shutil
 from pathlib import Path
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 import more_itertools
 import polars as pl
 from PIL import Image
 from rich.progress import track
-from hafnia.dataset.dataset_names import ColumnName, FieldName
-from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.dataset_names import PrimitiveField, SampleField
 from hafnia.dataset.primitives import Classification
 from hafnia.utils import is_image_file
+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
-def import_image_classification_directory_tree(
+def from_image_classification_folder(
     path_folder: Path,
     split: str,
     n_samples: Optional[int] = None,
-) -> HafniaDataset:
+) -> "HafniaDataset":
+    from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
     class_folder_paths = [path for path in path_folder.iterdir() if path.is_dir()]
     class_names = sorted([folder.name for folder in class_folder_paths])  # Sort for determinism
@@ -62,8 +66,8 @@ def import_image_classification_directory_tree(
     return hafnia_dataset
-def export_image_classification_directory_tree(
-    dataset: HafniaDataset,
+def to_image_classification_folder(
+    dataset: "HafniaDataset",
     path_output: Path,
     task_name: Optional[str] = None,
     clean_folder: bool = False,
@@ -72,7 +76,7 @@ def export_image_classification_directory_tree(
     samples = dataset.samples.with_columns(
         pl.col(task.primitive.column_name())
-        .list.filter(pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        .list.filter(pl.element().struct.field(PrimitiveField.TASK_NAME) == task.name)
         .alias(task.primitive.column_name())
     )
@@ -95,11 +99,11 @@ def export_image_classification_directory_tree(
         if len(classifications) != 1:
             raise ValueError("Each sample should have exactly one classification.")
         classification = classifications[0]
-        class_name = classification[FieldName.CLASS_NAME].replace("/", "_")  # Avoid issues with subfolders
+        class_name = classification[PrimitiveField.CLASS_NAME].replace("/", "_")  # Avoid issues with subfolders
         path_class_folder = path_output / class_name
         path_class_folder.mkdir(parents=True, exist_ok=True)
-        path_image_org = Path(sample_dict[ColumnName.FILE_PATH])
+        path_image_org = Path(sample_dict[SampleField.FILE_PATH])
         path_image_new = path_class_folder / path_image_org.name
         shutil.copy2(path_image_org, path_image_new)

hafnia 0.4.0__tar.gz → 0.4.1__tar.gz

hafnia 0.4.0__tar.gz → 0.4.1__tar.gz