hafnia 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +130 -16
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +83 -22
  11. hafnia/dataset/format_conversions/format_image_classification_folder.py +110 -0
  12. hafnia/dataset/format_conversions/format_yolo.py +164 -0
  13. hafnia/dataset/format_conversions/torchvision_datasets.py +287 -0
  14. hafnia/dataset/hafnia_dataset.py +396 -96
  15. hafnia/dataset/operations/dataset_stats.py +84 -73
  16. hafnia/dataset/operations/dataset_transformations.py +116 -47
  17. hafnia/dataset/operations/table_transformations.py +135 -17
  18. hafnia/dataset/primitives/bbox.py +25 -14
  19. hafnia/dataset/primitives/bitmask.py +22 -15
  20. hafnia/dataset/primitives/classification.py +16 -8
  21. hafnia/dataset/primitives/point.py +7 -3
  22. hafnia/dataset/primitives/polygon.py +15 -10
  23. hafnia/dataset/primitives/primitive.py +1 -1
  24. hafnia/dataset/primitives/segmentation.py +12 -9
  25. hafnia/experiment/hafnia_logger.py +0 -9
  26. hafnia/platform/dataset_recipe.py +7 -2
  27. hafnia/platform/datasets.py +5 -9
  28. hafnia/platform/download.py +24 -90
  29. hafnia/torch_helpers.py +12 -12
  30. hafnia/utils.py +17 -0
  31. hafnia/visualizations/image_visualizations.py +3 -1
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/METADATA +11 -9
  33. hafnia-0.4.1.dist-info/RECORD +57 -0
  34. hafnia-0.3.0.dist-info/RECORD +0 -53
  35. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/WHEEL +0 -0
  36. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/entry_points.txt +0 -0
  37. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/operations/table_transformations.py

@@ -1,14 +1,14 @@
 from pathlib import Path
-from typing import List, Optional, Type
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type

 import polars as pl
-from tqdm import tqdm
+from rich.progress import track

 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
-    ColumnName,
-    FieldName,
+    PrimitiveField,
+    SampleField,
 )
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import PRIMITIVE_TYPES
@@ -16,9 +16,15 @@ from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.log import user_logger

+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import TaskInfo
+

 def create_primitive_table(
-    samples_table: pl.DataFrame, PrimitiveType: Type[Primitive], keep_sample_data: bool = False
+    samples_table: pl.DataFrame,
+    PrimitiveType: Type[Primitive],
+    keep_sample_data: bool = False,
+    task_name: Optional[str] = None,
 ) -> Optional[pl.DataFrame]:
     """
     Returns a DataFrame with objects of the specified primitive type.
@@ -48,6 +54,9 @@ def create_primitive_table(
         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
     else:
         objects_df = remove_no_object_frames.select(pl.col(column_name).explode().struct.unnest())
+
+    if task_name is not None:
+        objects_df = objects_df.filter(pl.col(PrimitiveField.TASK_NAME) == task_name)
     return objects_df

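The new `task_name` parameter narrows the exploded primitive rows to a single task. A minimal sketch of the filter's effect on a toy samples table (the data and literal field strings are illustrative; the package resolves them via `PrimitiveField`):

    import polars as pl

    # Toy samples table with a list[struct] primitive column, as produced by a HafniaDataset
    samples = pl.DataFrame(
        {
            "bboxes": [
                [
                    {"task_name": "task1", "class_name": "car"},
                    {"task_name": "task2", "class_name": "person"},
                ]
            ]
        }
    )

    # Mirrors create_primitive_table(..., task_name="task1") in 0.4.1:
    objects_df = samples.select(pl.col("bboxes").explode().struct.unnest())
    objects_df = objects_df.filter(pl.col("task_name") == "task1")
    print(objects_df)  # Only the "task1" objects remain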
@@ -55,11 +64,12 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
     has_same_schema = samples0.schema == samples1.schema
     if not has_same_schema:
         shared_columns = []
-        for column_name, column_type in samples0.schema.items():
+        for column_name, s0_column_type in samples0.schema.items():
             if column_name not in samples1.schema:
                 continue
+            samples0, samples1 = correction_of_list_struct_primitives(samples0, samples1, column_name)

-            if column_type != samples1.schema[column_name]:
+            if samples0.schema[column_name] != samples1.schema[column_name]:
                 continue
             shared_columns.append(column_name)

@@ -79,16 +89,58 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
     samples0 = samples0.select(list(shared_columns))
     samples1 = samples1.select(list(shared_columns))
     merged_samples = pl.concat([samples0, samples1], how="vertical")
-    merged_samples = merged_samples.drop(ColumnName.SAMPLE_INDEX).with_row_index(name=ColumnName.SAMPLE_INDEX)
+    merged_samples = add_sample_index(merged_samples)
     return merged_samples


+def correction_of_list_struct_primitives(
+    samples0: pl.DataFrame,
+    samples1: pl.DataFrame,
+    column_name: str,
+) -> Tuple[pl.DataFrame, pl.DataFrame]:
+    """
+    Corrects primitive columns (bboxes, polygons etc. of type 'list[struct]') by removing non-matching struct fields
+    between two datasets. This is useful when merging two datasets with the same primitive (e.g. Bbox), where
+    some (less important) field types in the struct differ between the two datasets.
+    This issue often occurs with the 'meta' field, as different dataset formats may store different metadata.
+    """
+    s0_column_type = samples0.schema[column_name]
+    s1_column_type = samples1.schema[column_name]
+    is_list_structs = s1_column_type == pl.List(pl.Struct) and s0_column_type == pl.List(pl.Struct)
+    is_non_matching_types = s1_column_type != s0_column_type
+    if is_list_structs and is_non_matching_types:  # Only perform correction for list[struct] types that do not match
+        s0_fields = set(s0_column_type.inner.fields)
+        s1_fields = set(s1_column_type.inner.fields)
+        similar_fields = s0_fields.intersection(s1_fields)
+        s0_dropped_fields = s0_fields - similar_fields
+        if len(s0_dropped_fields) > 0:
+            samples0 = samples0.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        s1_dropped_fields = s1_fields - similar_fields
+        if len(s1_dropped_fields) > 0:
+            samples1 = samples1.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        user_logger.warning(
+            f"Primitive column '{column_name}' has non-matching fields in the two datasets. "
+            f"Dropping fields in samples0: {[f.name for f in s0_dropped_fields]}. "
+            f"Dropping fields in samples1: {[f.name for f in s1_dropped_fields]}."
+        )
+
+    return samples0, samples1
+
+
 def filter_table_for_class_names(
     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
 ) -> Optional[pl.DataFrame]:
     table_with_selected_class_names = samples_table.filter(
         pl.col(PrimitiveType.column_name())
-        .list.eval(pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names))
+        .list.eval(pl.element().struct.field(PrimitiveField.CLASS_NAME).is_in(class_names))
         .list.any()
     )

@@ -100,20 +152,20 @@ def split_primitive_columns_by_task_name(
     coordinate_types: Optional[List[Type[Primitive]]] = None,
 ) -> pl.DataFrame:
     """
-    Convert Primitive columns such as "objects" (Bbox) into a column for each task name.
-    For example, if the "objects" column (containing Bbox objects) has tasks "task1" and "task2".
+    Convert Primitive columns such as "bboxes" (Bbox) into a column for each task name.
+    For example, if the "bboxes" column (containing Bbox objects) has tasks "task1" and "task2".


     This:
     ─┬────────────┬─
-     ┆ objects    ┆
+     ┆ bboxes     ┆
      ┆ ---        ┆
      ┆ list[struc ┆
      ┆ t[11]]     ┆
     ═╪════════════╪═
     becomes this:
     ─┬────────────┬────────────┬─
-     ┆ objects.   ┆ objects.   ┆
+     ┆ bboxes.    ┆ bboxes.    ┆
      ┆ task1      ┆ task2      ┆
      ┆ ---        ┆ ---        ┆
      ┆ list[struc ┆ list[struc ┆
@@ -131,11 +183,11 @@ def split_primitive_columns_by_task_name(
         if samples_table[col_name].dtype != pl.List(pl.Struct):
             continue

-        task_names = samples_table[col_name].explode().struct.field(FieldName.TASK_NAME).unique().to_list()
+        task_names = samples_table[col_name].explode().struct.field(PrimitiveField.TASK_NAME).unique().to_list()
         samples_table = samples_table.with_columns(
             [
                 pl.col(col_name)
-                .list.filter(pl.element().struct.field(FieldName.TASK_NAME).eq(task_name))
+                .list.filter(pl.element().struct.field(PrimitiveField.TASK_NAME).eq(task_name))
                 .alias(f"{col_name}.{task_name}")
                 for task_name in task_names
             ]
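For reference, the per-task split now operates on the renamed "bboxes" column. A small sketch of the before/after columns (toy data, with literal field strings in place of `PrimitiveField`):

    import polars as pl

    samples = pl.DataFrame(
        {
            "bboxes": [
                [
                    {"task_name": "task1", "class_name": "car"},
                    {"task_name": "task2", "class_name": "person"},
                ]
            ]
        }
    )
    task_names = samples["bboxes"].explode().struct.field("task_name").unique().to_list()
    samples = samples.with_columns(
        [
            pl.col("bboxes")
            .list.filter(pl.element().struct.field("task_name").eq(task_name))
            .alias(f"bboxes.{task_name}")
            for task_name in task_names
        ]
    )
    print(samples.columns)  # ['bboxes', 'bboxes.task1', 'bboxes.task2'] (task order may vary)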
@@ -144,7 +196,7 @@ def split_primitive_columns_by_task_name(
     return samples_table


-def read_table_from_path(path: Path) -> pl.DataFrame:
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -162,7 +214,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:

 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-    for org_path in tqdm(table["file_name"].to_list(), desc="Check image paths"):
+    org_paths = table[SampleField.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
         org_path = Path(org_path)
         if not org_path.exists():
             missing_files.append(org_path)
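Progress reporting switches from tqdm to Rich throughout this release; at call sites the main difference is the keyword name (`desc` becomes `description`). A minimal comparison:

    from rich.progress import track

    paths = ["a.jpg", "b.jpg", "c.jpg"]

    # 0.3.0: from tqdm import tqdm
    # for path in tqdm(paths, desc="Check image paths"): ...

    # 0.4.1:
    for path in track(paths, description="Check image paths"):
        ...  # e.g. Path(path).exists()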
@@ -218,3 +271,68 @@ def unnest_classification_tasks(table: pl.DataFrame, strict: bool = True) -> pl.DataFrame:

     table_out = table_out.with_columns([pl.col(c).list.first() for c in classification_columns])
     return table_out
+
+
+def update_class_indices(samples: pl.DataFrame, task: "TaskInfo") -> pl.DataFrame:
+    if task.class_names is None or len(task.class_names) == 0:
+        raise ValueError(f"Task '{task.name}' does not have defined class names to update class indices.")
+
+    objs = (
+        samples[task.primitive.column_name()]
+        .explode()
+        .struct.unnest()
+        .filter(pl.col(PrimitiveField.TASK_NAME) == task.name)
+    )
+    expected_class_names = set(objs[PrimitiveField.CLASS_NAME].unique())
+    missing_class_names = expected_class_names - set(task.class_names)
+    if len(missing_class_names) > 0:
+        raise ValueError(
+            f"Task '{task.name}' is missing class names: {missing_class_names}. Cannot update class indices."
+        )
+
+    name_2_idx_mapping = {name: idx for idx, name in enumerate(task.class_names)}
+
+    samples_updated = samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(PrimitiveField.TASK_NAME) == task.name)
+                .then(pl.field(PrimitiveField.CLASS_NAME).replace_strict(name_2_idx_mapping, default=-1))
+                .otherwise(pl.field(PrimitiveField.CLASS_IDX))
+                .alias(PrimitiveField.CLASS_IDX)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    return samples_updated
+
+
+def add_sample_index(samples: pl.DataFrame) -> pl.DataFrame:
+    """
+    Adds a sample index column to the samples DataFrame.
+
+    Note: Unlike the built-in 'polars.DataFrame.with_row_count', this function
+    always guarantees 'pl.UInt64' type for the index column.
+    """
+    if SampleField.SAMPLE_INDEX in samples.columns:
+        samples = samples.drop(SampleField.SAMPLE_INDEX)
+    samples = samples.select(
+        pl.int_range(0, pl.count(), dtype=pl.UInt64).alias(SampleField.SAMPLE_INDEX),
+        pl.all(),
+    )
+    return samples
+
+
+def add_dataset_name_if_missing(table: pl.DataFrame, dataset_name: str) -> pl.DataFrame:
+    if SampleField.DATASET_NAME not in table.columns:
+        table = table.with_columns(pl.lit(dataset_name).alias(SampleField.DATASET_NAME))
+    else:
+        table = table.with_columns(
+            pl.when(pl.col(SampleField.DATASET_NAME).is_null())
+            .then(pl.lit(dataset_name))
+            .otherwise(pl.col(SampleField.DATASET_NAME))
+            .alias(SampleField.DATASET_NAME)
+        )
+
+    return table
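Both new table helpers are safe to call repeatedly. A usage sketch on a toy table, assuming the `SampleField` constants resolve to the literal strings "sample_index" and "dataset_name" (the enum values are not shown in this diff; the helper itself uses `pl.count()`, an older alias of `pl.len()`):

    import polars as pl

    table = pl.DataFrame({"file_name": ["a.jpg", "b.jpg"], "dataset_name": [None, "coco"]})

    # add_sample_index: always (re)creates a UInt64 index as the first column
    table = table.select(pl.int_range(0, pl.len(), dtype=pl.UInt64).alias("sample_index"), pl.all())

    # add_dataset_name_if_missing: fills only rows where the name is null
    table = table.with_columns(
        pl.when(pl.col("dataset_name").is_null())
        .then(pl.lit("my_dataset"))
        .otherwise(pl.col("dataset_name"))
        .alias("dataset_name")
    )
    print(table)  # Rows: (0, "a.jpg", "my_dataset"), (1, "b.jpg", "coco")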
hafnia/dataset/primitives/bbox.py

@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,26 +18,36 @@ from hafnia.dataset.primitives.utils import (

 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float  # Height of the bounding box as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    width: float  # Width of the bounding box as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_x: float  # X coordinate of top-left corner of Bbox as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_y: float  # Y coordinate of top-left corner of Bbox as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bbox tasks in the same dataset. "" defaults to "bboxes"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "bboxes"
+        return "object_detection"

     @staticmethod
     def column_name() -> str:
-        return "objects"
+        return "bboxes"

     def calculate_area(self) -> float:
         return self.height * self.width
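Two renames here are breaking for Bbox consumers: the storage column changes from "objects" to "bboxes" and the default task name from "bboxes" to "object_detection"; `confidence` also now defaults to 1.0 instead of None. A quick sketch (coordinate values are illustrative):

    from hafnia.dataset.primitives.bbox import Bbox

    bbox = Bbox(
        top_left_x=0.10,  # normalized: 10% from the left edge
        top_left_y=0.20,  # normalized: 20% from the top edge
        width=0.30,
        height=0.15,
        class_name="car",
    )
    assert bbox.confidence == 1.0  # new default in 0.4.1 (was Optional[float] = None)
    assert Bbox.column_name() == "bboxes"  # was "objects" in 0.3.0
    assert Bbox.default_task_name() == "object_detection"  # was "bboxes"
    print(bbox.calculate_area())  # 0.045 (= height * width, in normalized units)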
hafnia/dataset/primitives/bitmask.py

@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,24 +18,30 @@ from hafnia.dataset.primitives.utils import (

 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int  # Bitmask top coordinate in pixels
-    left: int  # Bitmask left coordinate in pixels
-    height: int  # Bitmask height of the bounding box in pixels
-    width: int  # Bitmask width of the bounding box in pixels
-    rleString: str  # Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left).
-    area: Optional[float] = None  # Area of the bitmask in pixels is calculated from the RLE string
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bitmask tasks in the same dataset. "" defaults to "bitmask"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    top: int = Field(description="Bitmask top coordinate in pixels")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels, calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "bitmask"
+        return "mask_detection"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/classification.py

@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple

 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,18 +9,25 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name

 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(
+        default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    task_name: str = ""  # To support multiple Classification tasks in the same dataset. "" defaults to "classification"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )

     @staticmethod
     def default_task_name() -> str:
-        return "classification"
+        return "image_classification"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/point.py

@@ -1,13 +1,17 @@
 from typing import Any, Tuple

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from hafnia.dataset.primitives.utils import clip


 class Point(BaseModel):
-    x: float
-    y: float
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )

     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
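With the new Field descriptions, Point documents its normalized-coordinate contract directly in the schema. A hedged usage sketch; the diff shows only the `to_pixel_coordinates` signature, so the (height, width) interpretation of `image_shape` and the returned tuple order are assumptions to verify against the package:

    from hafnia.dataset.primitives.point import Point

    point = Point(x=0.5, y=0.25)  # image center horizontally, a quarter of the way down
    # Assumption: image_shape is (height, width)
    px = point.to_pixel_coordinates(image_shape=(480, 640), as_int=True, clip_values=True)
    print(px)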
hafnia/dataset/primitives/polygon.py

@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,19 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name

 class Polygon(Primitive):
     # Names should match names in FieldName
-    points: List[Point]
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Polygon tasks in the same dataset. "" defaults to "polygon"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    points: List[Point] = Field(description="List of points defining the polygon")
+    class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the polygon"
+    )

     @staticmethod
     def from_list_of_points(
@@ -33,7 +38,7 @@ class Polygon(Primitive):

     @staticmethod
     def default_task_name() -> str:
-        return "polygon"
+        return "polygon_detection"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/primitive.py

@@ -22,7 +22,7 @@ class Primitive(BaseModel, metaclass=ABCMeta):
     def column_name() -> str:
         """
         Name of field used in hugging face datasets for storing annotations
-        E.g. "objects" for Bbox.
+        E.g. "bboxes" for Bbox.
         """
         pass

hafnia/dataset/primitives/segmentation.py

@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import get_class_name
@@ -9,23 +10,25 @@ from hafnia.visualizations.colors import get_n_colors


 class Segmentation(Primitive):
-    # mask: np.ndarray
-    class_names: Optional[List[str]] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    # WARNING: Segmentation masks have not been fully implemented yet
+    class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    # confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    task_name: str = (
-        ""  # Task name to support multiple Segmentation tasks in the same dataset. "" defaults to "segmentation"
+    task_name: str = Field(
+        default="",
+        description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the segmentation"
     )
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask

     @staticmethod
     def default_task_name() -> str:
-        return "segmentation"
+        return "semantic_segmentation"

     @staticmethod
     def column_name() -> str:
-        return "segmentation"
+        return "segmentations"

     def calculate_area(self) -> float:
         raise NotImplementedError()
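Taken together, 0.4.1 renames every primitive's default task name, plus two column names. A verification sketch using only values visible in the hunks above (import paths follow the file list):

    from hafnia.dataset.primitives.bbox import Bbox
    from hafnia.dataset.primitives.bitmask import Bitmask
    from hafnia.dataset.primitives.classification import Classification
    from hafnia.dataset.primitives.polygon import Polygon
    from hafnia.dataset.primitives.segmentation import Segmentation

    # 0.3.0 -> 0.4.1 default task names
    assert Bbox.default_task_name() == "object_detection"  # was "bboxes"
    assert Bitmask.default_task_name() == "mask_detection"  # was "bitmask"
    assert Classification.default_task_name() == "image_classification"  # was "classification"
    assert Polygon.default_task_name() == "polygon_detection"  # was "polygon"
    assert Segmentation.default_task_name() == "semantic_segmentation"  # was "segmentation"

    # Column renames
    assert Bbox.column_name() == "bboxes"  # was "objects"
    assert Segmentation.column_name() == "segmentations"  # was "segmentation"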
hafnia/experiment/hafnia_logger.py

@@ -12,8 +12,6 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from pydantic import BaseModel, field_validator

-from hafnia.data.factory import load_dataset
-from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
 from hafnia.utils import is_hafnia_cloud_job, now_as_str

@@ -136,13 +134,6 @@ class HafniaLogger:
         except Exception as e:
             user_logger.error(f"Failed to initialize MLflow: {e}")

-    def load_dataset(self, dataset_name: str) -> HafniaDataset:
-        """
-        Load a dataset from the specified path.
-        """
-        self.dataset_name = dataset_name
-        return load_dataset(dataset_name)
-
     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
         if is_hafnia_cloud_job():
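Callers of the removed `HafniaLogger.load_dataset` need a migration path. A hedged sketch, assuming the factory function that 0.3.0 imported (`hafnia.data.factory.load_dataset`) remains available in 0.4.1 (only the logger's import of it is removed in this diff):

    from hafnia.data.factory import load_dataset

    # 0.3.0:
    # dataset = logger.load_dataset("my-dataset")

    # 0.4.1: load directly; the logger no longer stores dataset_name as a side effect
    dataset = load_dataset("my-dataset")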
hafnia/platform/dataset_recipe.py

@@ -11,12 +11,17 @@ from hafnia.utils import pretty_print_list_as_table, timed

 @timed("Get or create dataset recipe")
 def get_or_create_dataset_recipe(
-    recipe: dict, endpoint: str, api_key: str, name: Optional[str] = None
+    recipe: dict,
+    endpoint: str,
+    api_key: str,
+    name: Optional[str] = None,
+    overwrite: bool = False,
 ) -> Optional[Dict]:
     headers = {"Authorization": api_key}
-    data = {"template": {"body": recipe}}
+    data = {"template": {"body": recipe}, "overwrite": overwrite}
     if name is not None:
         data["name"] = name  # type: ignore[assignment]
+
     response = http.post(endpoint, headers=headers, data=data)
     return response

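The new `overwrite` flag is simply forwarded in the request body, letting callers replace an existing recipe instead of reusing it. A usage sketch (recipe body, endpoint, and key are placeholders):

    from hafnia.platform.dataset_recipe import get_or_create_dataset_recipe

    recipe = {"kind": "dataset", "steps": []}  # hypothetical recipe body
    response = get_or_create_dataset_recipe(
        recipe=recipe,
        endpoint="https://api.example.com/dataset-recipes",  # placeholder
        api_key="YOUR_API_KEY",  # placeholder
        name="my-recipe",
        overwrite=True,  # new in 0.4.1
    )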
hafnia/platform/download.py

@@ -9,11 +9,11 @@ from typing import Any, Dict, List, Optional

 import rich
 from rich import print as rprint
-from tqdm import tqdm
+from rich.progress import track

 from cli.config import Config
 from hafnia import http, utils
-from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
     get_dataset_path_from_recipe,
@@ -120,15 +120,11 @@ def download_dataset_from_access_endpoint(
         return
     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     try:
-        fast_copy_files_s3(
-            src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
-            dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
-            append_envs=envs,
-            description="Downloading images",
-        )
+        dataset = dataset.download_files_aws(path_dataset, aws_credentials=resource_credentials, force_redownload=True)
     except ValueError as e:
         user_logger.error(f"Failed to download images: {e}")
         return
+    dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded


 def fast_copy_files_s3(
@@ -196,7 +192,7 @@ def execute_s5cmd_commands(

     error_lines = []
     lines = []
-    for line in tqdm(process.stdout, total=len(commands), desc=description):
+    for line in track(process.stdout, total=len(commands), description=description):
         if "ERROR" in line or "error" in line:
             error_lines.append(line.strip())
         lines.append(line.strip())
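The CLI download path no longer builds s5cmd copy lists from the REMOTE_PATH/FILE_NAME columns; it delegates to the dataset object and then rewrites the annotations. A condensed sketch of the new flow using only the calls visible in the hunk above (`path_dataset` and `resource_credentials` come from the surrounding function):

    from hafnia.dataset.hafnia_dataset import HafniaDataset

    dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
    dataset = dataset.download_files_aws(
        path_dataset,
        aws_credentials=resource_credentials,
        force_redownload=True,
    )
    # Annotations are rewritten because file locations changed during the download
    dataset.write_annotations(path_folder=path_dataset)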