hafnia 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +130 -16
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +83 -22
  11. hafnia/dataset/format_conversions/format_image_classification_folder.py +110 -0
  12. hafnia/dataset/format_conversions/format_yolo.py +164 -0
  13. hafnia/dataset/format_conversions/torchvision_datasets.py +287 -0
  14. hafnia/dataset/hafnia_dataset.py +396 -96
  15. hafnia/dataset/operations/dataset_stats.py +84 -73
  16. hafnia/dataset/operations/dataset_transformations.py +116 -47
  17. hafnia/dataset/operations/table_transformations.py +135 -17
  18. hafnia/dataset/primitives/bbox.py +25 -14
  19. hafnia/dataset/primitives/bitmask.py +22 -15
  20. hafnia/dataset/primitives/classification.py +16 -8
  21. hafnia/dataset/primitives/point.py +7 -3
  22. hafnia/dataset/primitives/polygon.py +15 -10
  23. hafnia/dataset/primitives/primitive.py +1 -1
  24. hafnia/dataset/primitives/segmentation.py +12 -9
  25. hafnia/experiment/hafnia_logger.py +0 -9
  26. hafnia/platform/dataset_recipe.py +7 -2
  27. hafnia/platform/datasets.py +5 -9
  28. hafnia/platform/download.py +24 -90
  29. hafnia/torch_helpers.py +12 -12
  30. hafnia/utils.py +17 -0
  31. hafnia/visualizations/image_visualizations.py +3 -1
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/METADATA +11 -9
  33. hafnia-0.4.1.dist-info/RECORD +57 -0
  34. hafnia-0.3.0.dist-info/RECORD +0 -53
  35. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/WHEEL +0 -0
  36. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/entry_points.txt +0 -0
  37. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/licenses/LICENSE +0 -0

hafnia/dataset/operations/dataset_stats.py

@@ -1,14 +1,14 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING, Dict, Optional, Type
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type

  import polars as pl
  import rich
  from rich import print as rprint
+ from rich.progress import track
  from rich.table import Table
- from tqdm import tqdm

- from hafnia.dataset.dataset_names import ColumnName, FieldName, SplitName
+ from hafnia.dataset.dataset_names import PrimitiveField, SampleField, SplitName
  from hafnia.dataset.operations.table_transformations import create_primitive_table
  from hafnia.dataset.primitives import PRIMITIVE_TYPES
  from hafnia.log import user_logger
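
Note for downstream code: the 0.3.0 constants ColumnName and FieldName are replaced by SampleField (table-level sample columns such as the split and file path) and PrimitiveField (fields inside the per-annotation structs such as task name and class name). A minimal migration sketch, assuming the new names are exported from hafnia.dataset.dataset_names exactly as they are imported in this diff; the full set of members is not shown here:

    # hafnia 0.3.0
    # from hafnia.dataset.dataset_names import ColumnName, FieldName, SplitName
    # split_column = ColumnName.SPLIT
    # task_field = FieldName.TASK_NAME

    # hafnia 0.4.1
    from hafnia.dataset.dataset_names import PrimitiveField, SampleField, SplitName

    split_column = SampleField.SPLIT        # table-level column name
    task_field = PrimitiveField.TASK_NAME   # field inside a primitive (annotation) struct
    class_field = PrimitiveField.CLASS_NAME
    print(split_column, task_field, class_field, SplitName.TRAIN)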
@@ -18,14 +18,14 @@ if TYPE_CHECKING: # Using 'TYPE_CHECKING' to avoid circular imports during type
      from hafnia.dataset.primitives.primitive import Primitive


- def split_counts(dataset: HafniaDataset) -> Dict[str, int]:
+ def calculate_split_counts(dataset: HafniaDataset) -> Dict[str, int]:
      """
      Returns a dictionary with the counts of samples in each split of the dataset.
      """
-     return dict(dataset.samples[ColumnName.SPLIT].value_counts().iter_rows())
+     return dict(dataset.samples[SampleField.SPLIT].value_counts().iter_rows())


- def class_counts_for_task(
+ def calculate_task_class_counts(
      dataset: HafniaDataset,
      primitive: Optional[Type[Primitive]] = None,
      task_name: Optional[str] = None,
@@ -53,7 +53,7 @@ def class_counts_for_task(
          dataset.samples[task.primitive.column_name()]
          .explode()
          .struct.unnest()
-         .filter(pl.col(FieldName.TASK_NAME) == task.name)[FieldName.CLASS_NAME]
+         .filter(pl.col(PrimitiveField.TASK_NAME) == task.name)[PrimitiveField.CLASS_NAME]
          .value_counts()
      )

@@ -65,7 +65,7 @@ def class_counts_for_task(
      return class_counts


- def class_counts_all(dataset: HafniaDataset) -> Dict[str, int]:
+ def calculate_class_counts(dataset: HafniaDataset) -> List[Dict[str, Any]]:
      """
      Get class counts for all tasks in the dataset.
      The counts are returned as a dictionary where keys are in the format
@@ -74,25 +74,59 @@ def class_counts_all(dataset: HafniaDataset) -> Dict[str, int]:
      Example:
          >>> counts = dataset.class_counts_all()
          >>> print(counts)
-         {
-             objects/bboxes/car: 500
-             objects/bboxes/person: 0
-             classifications/weather/sunny: 300
-             classifications/weather/rainy: 0
-             ...
-         }
+         [
+             {'Primitive': 'Bbox', 'Task Name': 'detection', 'Class Name': 'car', 'Count': 500},
+             {'Primitive': 'Bbox', 'Task Name': 'detection', 'Class Name': 'bus', 'Count': 100},
+             {'Primitive': 'Classification', 'Task Name': 'scene', 'Class Name': 'indoor', 'Count': 300},
+             {'Primitive': 'Classification', 'Task Name': 'scene', 'Class Name': 'outdoor', 'Count': 700},
+         ]
      """
-     class_counts = {}
+     count_info = []
      for task in dataset.info.tasks:
-         if task.class_names is None:
-             raise ValueError(f"Task '{task.name}' does not have class names defined.")
-         class_counts_task = dataset.class_counts_for_task(primitive=task.primitive, task_name=task.name)
+         class_name_counts = dataset.calculate_task_class_counts(task_name=task.name)
+         for name, counts in class_name_counts.items():
+             count_info.append(
+                 {
+                     "Primitive": task.primitive.__name__,
+                     "Task Name": task.name,
+                     "Class Name": name,
+                     "Count": counts,
+                 }
+             )
+     return count_info

-         for class_idx, (class_name, count) in enumerate(class_counts_task.items()):
-             count_name = f"{task.primitive.__name__}/{task.name}/{class_name}"
-             class_counts[count_name] = count

-     return class_counts
+ def calculate_primitive_counts(dataset: HafniaDataset) -> Dict[str, int]:
+     annotation_counts = {}
+     for task in dataset.info.tasks:
+         objects = dataset.create_primitive_table(task.primitive, task_name=task.name)
+         name = task.primitive.__name__
+         if task.name != task.primitive.default_task_name():
+             name = f"{name}.{task.name}"
+         annotation_counts[name] = len(objects)
+     return annotation_counts
+
+
+ def calculate_split_counts_extended(dataset: HafniaDataset) -> List[Dict[str, Any]]:
+     splits_sets = {
+         "All": SplitName.valid_splits(),
+         "Train": [SplitName.TRAIN],
+         "Validation": [SplitName.VAL],
+         "Test": [SplitName.TEST],
+     }
+     rows = []
+     for split_name, splits in splits_sets.items():
+         dataset_split = dataset.create_split_dataset(splits)
+         table = dataset_split.samples
+         row: Dict[str, Any] = {}
+         row["Split"] = split_name
+         row["Samples "] = str(len(table))
+
+         primitive_counts = calculate_primitive_counts(dataset_split)
+         row.update(primitive_counts)
+         rows.append(row)
+
+     return rows


  def print_stats(dataset: HafniaDataset) -> None:
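
Note for downstream code: the statistics helpers are renamed (split_counts becomes calculate_split_counts, class_counts_for_task becomes calculate_task_class_counts), and calculate_class_counts now returns a list of row dictionaries instead of a flat name-to-count mapping. A minimal usage sketch, assuming an existing HafniaDataset instance named dataset and that the helpers remain importable from hafnia.dataset.operations.dataset_stats (the module shown in the file list above):

    from hafnia.dataset.operations import dataset_stats

    split_counts = dataset_stats.calculate_split_counts(dataset)  # mapping of split name -> sample count
    class_rows = dataset_stats.calculate_class_counts(dataset)    # list of dicts: Primitive / Task Name / Class Name / Count
    for row in class_rows:
        print(row["Primitive"], row["Task Name"], row["Class Name"], row["Count"])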
@@ -118,10 +152,13 @@ def print_class_distribution(dataset: HafniaDataset) -> None:
      for task in dataset.info.tasks:
          if task.class_names is None:
              raise ValueError(f"Task '{task.name}' does not have class names defined.")
-         class_counts = dataset.class_counts_for_task(primitive=task.primitive, task_name=task.name)
+         class_counts = dataset.calculate_task_class_counts(primitive=task.primitive, task_name=task.name)

          # Print class distribution
-         rich_table = Table(title=f"Class Count for '{task.primitive.__name__}/{task.name}'", show_lines=False)
+         rich_table = Table(
+             title=f"Class Count for '{task.primitive.__name__}/{task.name}'",
+             show_lines=False,
+         )
          rich_table.add_column("Class Name", style="cyan")
          rich_table.add_column("Class Idx", style="cyan")
          rich_table.add_column("Count", justify="right")
@@ -136,32 +173,7 @@ def print_sample_and_task_counts(dataset: HafniaDataset) -> None:
      Prints a table with sample counts and task counts for each primitive type
      in total and for each split (train, val, test).
      """
-     from hafnia.dataset.operations.table_transformations import create_primitive_table
-     from hafnia.dataset.primitives import PRIMITIVE_TYPES
-
-     splits_sets = {
-         "All": SplitName.valid_splits(),
-         "Train": [SplitName.TRAIN],
-         "Validation": [SplitName.VAL],
-         "Test": [SplitName.TEST],
-     }
-     rows = []
-     for split_name, splits in splits_sets.items():
-         dataset_split = dataset.create_split_dataset(splits)
-         table = dataset_split.samples
-         row = {}
-         row["Split"] = split_name
-         row["Sample "] = str(len(table))
-         for PrimitiveType in PRIMITIVE_TYPES:
-             column_name = PrimitiveType.column_name()
-             objects_df = create_primitive_table(table, PrimitiveType=PrimitiveType, keep_sample_data=False)
-             if objects_df is None:
-                 continue
-             for (task_name,), object_group in objects_df.group_by(FieldName.TASK_NAME):
-                 count = len(object_group[FieldName.CLASS_NAME])
-                 row[f"{PrimitiveType.__name__}\n{task_name}"] = str(count)
-         rows.append(row)
-
+     rows = calculate_split_counts_extended(dataset)
      rich_table = Table(title="Dataset Statistics", show_lines=True, box=rich.box.SIMPLE)
      for i_row, row in enumerate(rows):
          if i_row == 0:
@@ -171,7 +183,7 @@ def print_sample_and_task_counts(dataset: HafniaDataset) -> None:
      rprint(rich_table)


- def check_dataset(dataset: HafniaDataset):
+ def check_dataset(dataset: HafniaDataset, check_splits: bool = True):
      """
      Performs various checks on the dataset to ensure its integrity and consistency.
      Raises errors if any issues are found.
@@ -179,24 +191,23 @@
      from hafnia.dataset.hafnia_dataset import Sample

      user_logger.info("Checking Hafnia dataset...")
-     assert isinstance(dataset.info.version, str) and len(dataset.info.version) > 0
      assert isinstance(dataset.info.dataset_name, str) and len(dataset.info.dataset_name) > 0

-     sample_dataset = dataset.create_sample_dataset()
-     if len(sample_dataset) == 0:
-         raise ValueError("The dataset does not include a sample dataset")
+     if check_splits:
+         sample_dataset = dataset.create_sample_dataset()
+         if len(sample_dataset) == 0:
+             raise ValueError("The dataset does not include a sample dataset")
+
+         actual_splits = dataset.samples.select(pl.col(SampleField.SPLIT)).unique().to_series().to_list()
+         required_splits = SplitName.valid_splits()

-     actual_splits = dataset.samples.select(pl.col(ColumnName.SPLIT)).unique().to_series().to_list()
-     expected_splits = SplitName.valid_splits()
-     if set(actual_splits) != set(expected_splits):
-         raise ValueError(f"Expected all splits '{expected_splits}' in dataset, but got '{actual_splits}'. ")
+         if not set(required_splits).issubset(set(actual_splits)):
+             raise ValueError(f"Expected all splits '{required_splits}' in dataset, but got '{actual_splits}'. ")

      dataset.check_dataset_tasks()

      expected_tasks = dataset.info.tasks
-     distribution = dataset.info.distributions or []
-     distribution_names = [task.name for task in distribution]
-     # Check that tasks found in the 'dataset.table' matches the tasks defined in 'dataset.info.tasks'
+     # Check that tasks found in the 'dataset.samples' matches the tasks defined in 'dataset.info.tasks'
      for PrimitiveType in PRIMITIVE_TYPES:
          column_name = PrimitiveType.column_name()
          if column_name not in dataset.samples.columns:
@@ -204,18 +215,18 @@
          objects_df = create_primitive_table(dataset.samples, PrimitiveType=PrimitiveType, keep_sample_data=False)
          if objects_df is None:
              continue
-         for (task_name,), object_group in objects_df.group_by(FieldName.TASK_NAME):
+         for (task_name,), object_group in objects_df.group_by(PrimitiveField.TASK_NAME):
              has_task = any([t for t in expected_tasks if t.name == task_name and t.primitive == PrimitiveType])
-             if has_task or (task_name in distribution_names):
+             if has_task:
                  continue
-             class_names = object_group[FieldName.CLASS_NAME].unique().to_list()
+             class_names = object_group[PrimitiveField.CLASS_NAME].unique().to_list()
              raise ValueError(
                  f"Task name '{task_name}' for the '{PrimitiveType.__name__}' primitive is missing in "
-                 f"'dataset.info.tasks' for dataset '{task_name}'. Missing task has the following "
+                 f"'dataset.info.tasks' for dataset '{dataset.info.dataset_name}'. Missing task has the following "
                  f"classes: {class_names}. "
              )

-     for sample_dict in tqdm(dataset, desc="Checking samples in dataset"):
+     for sample_dict in track(dataset, description="Checking samples in dataset"):
          sample = Sample(**sample_dict) # noqa: F841


@@ -238,7 +249,7 @@ def check_dataset_tasks(dataset: HafniaDataset):

          if len(dataset) > 0: # Check only performed for non-empty datasets
              primitive_table = (
-                 primitive_column.explode().struct.unnest().filter(pl.col(FieldName.TASK_NAME) == task.name)
+                 primitive_column.explode().struct.unnest().filter(pl.col(PrimitiveField.TASK_NAME) == task.name)
              )
              if primitive_table.is_empty():
                  raise ValueError(
@@ -246,7 +257,7 @@ def check_dataset_tasks(dataset: HafniaDataset):
                      + f"the column '{column_name}' has no {task.name=} objects. Please check the dataset."
                  )

-             actual_classes = set(primitive_table[FieldName.CLASS_NAME].unique().to_list())
+             actual_classes = set(primitive_table[PrimitiveField.CLASS_NAME].unique().to_list())
              if task.class_names is None:
                  raise ValueError(
                      msg_something_wrong
@@ -261,12 +272,12 @@ def check_dataset_tasks(dataset: HafniaDataset):
                      f"to be a subset of the defined classes\n\t{actual_classes=} \n\t{defined_classes=}."
                  )
              # Check class_indices
-             mapped_indices = primitive_table[FieldName.CLASS_NAME].map_elements(
+             mapped_indices = primitive_table[PrimitiveField.CLASS_NAME].map_elements(
                  lambda x: task.class_names.index(x), return_dtype=pl.Int64
              )
-             table_indices = primitive_table[FieldName.CLASS_IDX]
+             table_indices = primitive_table[PrimitiveField.CLASS_IDX]

              error_msg = msg_something_wrong + (
-                 f"class indices in '{FieldName.CLASS_IDX}' column does not match classes ordering in 'task.class_names'"
+                 f"class indices in '{PrimitiveField.CLASS_IDX}' column does not match classes ordering in 'task.class_names'"
              )
              assert mapped_indices.equals(table_indices), error_msg
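
Note for downstream code: check_dataset gains a check_splits flag, and the split check now only requires the train/val/test splits to be present (a subset check) rather than to match exactly. A minimal sketch, assuming an existing HafniaDataset instance named dataset and that check_dataset remains importable from hafnia.dataset.operations.dataset_stats:

    from hafnia.dataset.operations.dataset_stats import check_dataset

    check_dataset(dataset)                      # full check, including sample subset and required splits
    check_dataset(dataset, check_splits=False)  # skip the sample-subset and split-presence checks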

hafnia/dataset/operations/dataset_transformations.py

@@ -31,25 +31,32 @@ that the signatures match.

  import json
  import re
+ import shutil
  import textwrap
  from pathlib import Path
- from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union

  import cv2
  import more_itertools
  import numpy as np
  import polars as pl
- from PIL import Image
- from tqdm import tqdm
+ from rich.progress import track

  from hafnia.dataset import dataset_helpers
- from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, FieldName
+ from hafnia.dataset.dataset_names import (
+     OPS_REMOVE_CLASS,
+     PrimitiveField,
+     SampleField,
+     StorageFormat,
+ )
+ from hafnia.dataset.operations.table_transformations import update_class_indices
  from hafnia.dataset.primitives import get_primitive_type_from_string
  from hafnia.dataset.primitives.primitive import Primitive
+ from hafnia.log import user_logger
  from hafnia.utils import remove_duplicates_preserve_order

  if TYPE_CHECKING: # Using 'TYPE_CHECKING' to avoid circular imports during type checking
-     from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
+     from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample, TaskInfo


  ### Image transformations ###
@@ -57,7 +64,7 @@ class AnonymizeByPixelation:
      def __init__(self, resize_factor: float = 0.10):
          self.resize_factor = resize_factor

-     def __call__(self, frame: np.ndarray) -> np.ndarray:
+     def __call__(self, frame: np.ndarray, sample: "Sample") -> np.ndarray:
          org_size = frame.shape[:2]
          frame = cv2.resize(frame, (0, 0), fx=self.resize_factor, fy=self.resize_factor)
          frame = cv2.resize(frame, org_size[::-1], interpolation=cv2.INTER_NEAREST)
@@ -66,30 +73,100 @@ class AnonymizeByPixelation:

  def transform_images(
      dataset: "HafniaDataset",
-     transform: Callable[[np.ndarray], np.ndarray],
+     transform: Callable[[np.ndarray, "Sample"], np.ndarray],
      path_output: Path,
+     description: str = "Transform images",
  ) -> "HafniaDataset":
+     from hafnia.dataset.hafnia_dataset import Sample
+
      new_paths = []
      path_image_folder = path_output / "data"
      path_image_folder.mkdir(parents=True, exist_ok=True)

-     for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
-         org_path = Path(org_path)
-         if not org_path.exists():
-             raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
-
-         image = np.array(Image.open(org_path))
-         image_transformed = transform(image)
+     for sample_dict in track(dataset, description=description):
+         sample = Sample(**sample_dict)
+         image = sample.read_image()
+         image_transformed = transform(image, sample)
          new_path = dataset_helpers.save_image_with_hash_name(image_transformed, path_image_folder)

          if not new_path.exists():
              raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
          new_paths.append(str(new_path))

-     table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
+     table = dataset.samples.with_columns(pl.Series(new_paths).alias(SampleField.FILE_PATH))
      return dataset.update_samples(table)


+ def convert_to_image_storage_format(
+     dataset: "HafniaDataset",
+     path_output_folder: Path,
+     reextract_frames: bool,
+     image_format: str = "png",
+     transform: Optional[Callable[[np.ndarray, "Sample"], np.ndarray]] = None,
+ ) -> "HafniaDataset":
+     """
+     Convert a video-based dataset ("storage_format" == "video", FieldName.STORAGE_FORMAT == StorageFormat.VIDEO)
+     to an image-based dataset by extracting frames.
+     """
+     from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample
+
+     path_images = path_output_folder / "data"
+     path_images.mkdir(parents=True, exist_ok=True)
+
+     # Only video format dataset samples are processed
+     video_based_samples = dataset.samples.filter(pl.col(SampleField.STORAGE_FORMAT) == StorageFormat.VIDEO)
+
+     if video_based_samples.is_empty():
+         user_logger.info("Dataset has no video-based samples. Returning dataset unchanged.")
+         return dataset
+
+     update_list = []
+     for (path_video,), video_samples in video_based_samples.group_by(SampleField.FILE_PATH):
+         assert Path(path_video).exists(), (
+             f"'{path_video}' not found. We expect the video to be downloaded to '{path_output_folder}'"
+         )
+         video = cv2.VideoCapture(str(path_video))
+
+         video_samples = video_samples.sort(SampleField.COLLECTION_INDEX)
+         for sample_dict in track(
+             video_samples.iter_rows(named=True),
+             total=video_samples.height,
+             description=f"Extracting frames from '{Path(path_video).name}'",
+         ):
+             frame_number = sample_dict[SampleField.COLLECTION_INDEX]
+             image_name = f"{Path(path_video).stem}_F{frame_number:06d}.{image_format}"
+             path_image = path_images / image_name
+
+             update_list.append(
+                 {
+                     SampleField.SAMPLE_INDEX: sample_dict[SampleField.SAMPLE_INDEX],
+                     SampleField.COLLECTION_ID: sample_dict[SampleField.COLLECTION_ID],
+                     SampleField.COLLECTION_INDEX: frame_number,
+                     SampleField.FILE_PATH: path_image.as_posix(),
+                     SampleField.STORAGE_FORMAT: StorageFormat.IMAGE,
+                 }
+             )
+             if reextract_frames:
+                 shutil.rmtree(path_image, ignore_errors=True)
+             if path_image.exists():
+                 continue
+
+             video.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+             ret, frame_org = video.read()
+             if not ret:
+                 raise RuntimeError(f"Could not read frame {frame_number} from video '{path_video}'")
+
+             if transform is not None:
+                 frame_org = transform(frame_org, Sample(**sample_dict))
+
+             cv2.imwrite(str(path_image), frame_org)
+     df_updates = pl.DataFrame(update_list)
+     samples_as_images = dataset.samples.update(df_updates, on=[SampleField.COLLECTION_ID, SampleField.COLLECTION_INDEX])
+     hafnia_dataset = HafniaDataset(samples=samples_as_images, info=dataset.info)
+
+     return hafnia_dataset
+
+
  def get_task_info_from_task_name_and_primitive(
      tasks: List["TaskInfo"],
      task_name: Optional[str] = None,
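
Note for downstream code: image-transform callbacks now receive the Sample alongside the image (Callable[[np.ndarray, Sample], np.ndarray]), AnonymizeByPixelation.__call__ is updated to the same two-argument form, and the new convert_to_image_storage_format extracts frames from video-backed samples. A minimal sketch of the updated callback usage, assuming an existing HafniaDataset instance named dataset and that the names remain importable from hafnia.dataset.operations.dataset_transformations (the module shown in the file list above):

    from pathlib import Path

    from hafnia.dataset.operations.dataset_transformations import AnonymizeByPixelation, transform_images

    # AnonymizeByPixelation now matches the (frame, sample) callback signature expected by transform_images.
    pixelate = AnonymizeByPixelation(resize_factor=0.10)
    anonymized_dataset = transform_images(dataset, transform=pixelate, path_output=Path("anonymized_data"))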
@@ -156,13 +233,16 @@ def get_task_info_from_task_name_and_primitive(

  def class_mapper(
      dataset: "HafniaDataset",
-     class_mapping: Dict[str, str],
+     class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
      method: str = "strict",
      primitive: Optional[Type[Primitive]] = None,
      task_name: Optional[str] = None,
  ) -> "HafniaDataset":
      from hafnia.dataset.hafnia_dataset import HafniaDataset

+     if isinstance(class_mapping, list):
+         class_mapping = dict(class_mapping)
+
      allowed_methods = ("strict", "remove_undefined", "keep_undefined")
      if method not in allowed_methods:
          raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
@@ -170,7 +250,7 @@ def class_mapper(
      task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=primitive)
      current_names = task.class_names or []

-     # Expand wildcard mappings
+     # Expand wildcard mappings e.g. {"Vehicle.*": "Vehicle"} to {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}
      class_mapping = expand_class_mapping(class_mapping, current_names)

      non_existing_mapping_names = set(class_mapping) - set(current_names)
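
Note for downstream code: class_mapper now also accepts the mapping as a list of (old_name, new_name) tuples, and the comment above documents the wildcard expansion applied to mapping keys. A minimal sketch, assuming an existing HafniaDataset instance named dataset whose task uses 'Vehicle.Car'/'Vehicle.Bus' style class names (the class names here are purely illustrative):

    from hafnia.dataset.operations.dataset_transformations import class_mapper

    # Dict form with a wildcard: every class matching "Vehicle.*" is collapsed into "Vehicle".
    remapped = class_mapper(dataset, class_mapping={"Vehicle.*": "Vehicle"}, method="keep_undefined")

    # Equivalent list-of-tuples form, accepted in 0.4.1.
    remapped = class_mapper(dataset, class_mapping=[("Vehicle.*", "Vehicle")], method="keep_undefined")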
@@ -213,31 +293,16 @@ def class_mapper(
      if OPS_REMOVE_CLASS in new_class_names:
          # Move __REMOVE__ to the end of the list if it exists
          new_class_names.append(new_class_names.pop(new_class_names.index(OPS_REMOVE_CLASS)))
-     name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}

      samples = dataset.samples
      samples_updated = samples.with_columns(
          pl.col(task.primitive.column_name())
          .list.eval(
              pl.element().struct.with_fields(
-                 pl.when(pl.field(FieldName.TASK_NAME) == task.name)
-                 .then(pl.field(FieldName.CLASS_NAME).replace_strict(class_mapping))
-                 .otherwise(pl.field(FieldName.CLASS_NAME))
-                 .alias(FieldName.CLASS_NAME)
-             )
-         )
-         .alias(task.primitive.column_name())
-     )
-
-     # Update class indices too
-     samples_updated = samples_updated.with_columns(
-         pl.col(task.primitive.column_name())
-         .list.eval(
-             pl.element().struct.with_fields(
-                 pl.when(pl.field(FieldName.TASK_NAME) == task.name)
-                 .then(pl.field(FieldName.CLASS_NAME).replace_strict(name_2_idx_mapping))
-                 .otherwise(pl.field(FieldName.CLASS_IDX))
-                 .alias(FieldName.CLASS_IDX)
+                 pl.when(pl.field(PrimitiveField.TASK_NAME) == task.name)
+                 .then(pl.field(PrimitiveField.CLASS_NAME).replace_strict(class_mapping, default="Missing"))
+                 .otherwise(pl.field(PrimitiveField.CLASS_NAME))
+                 .alias(PrimitiveField.CLASS_NAME)
              )
          )
          .alias(task.primitive.column_name())
@@ -246,7 +311,7 @@ def class_mapper(
      if OPS_REMOVE_CLASS in new_class_names: # Remove class_names that are mapped to REMOVE_CLASS
          samples_updated = samples_updated.with_columns(
              pl.col(task.primitive.column_name())
-             .list.filter(pl.element().struct.field(FieldName.CLASS_NAME) != OPS_REMOVE_CLASS)
+             .list.filter(pl.element().struct.field(PrimitiveField.CLASS_NAME) != OPS_REMOVE_CLASS)
              .alias(task.primitive.column_name())
          )

@@ -255,6 +320,10 @@ def class_mapper(
      new_task = task.model_copy(deep=True)
      new_task.class_names = new_class_names
      dataset_info = dataset.info.replace_task(old_task=task, new_task=new_task)
+
+     # Update class indices to match new class names
+     samples_updated = update_class_indices(samples_updated, new_task)
+
      return HafniaDataset(info=dataset_info, samples=samples_updated)


@@ -313,7 +382,7 @@ def rename_task(
          pl.col(old_task.primitive.column_name())
          .list.eval(
              pl.element().struct.with_fields(
-                 pl.field(FieldName.TASK_NAME).replace(old_task.name, new_task.name).alias(FieldName.TASK_NAME)
+                 pl.field(PrimitiveField.TASK_NAME).replace(old_task.name, new_task.name).alias(PrimitiveField.TASK_NAME)
              )
          )
          .alias(new_task.primitive.column_name())
@@ -339,8 +408,8 @@ def select_samples_by_class_name(
      samples = dataset.samples.filter(
          pl.col(task.primitive.column_name())
          .list.eval(
-             pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names)
-             & (pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+             pl.element().struct.field(PrimitiveField.CLASS_NAME).is_in(class_names)
+             & (pl.element().struct.field(PrimitiveField.TASK_NAME) == task.name)
          )
          .list.any()
      )
@@ -354,14 +423,14 @@ def _validate_inputs_select_samples_by_class_name(
      name: Union[List[str], str],
      task_name: Optional[str] = None,
      primitive: Optional[Type[Primitive]] = None,
- ) -> Tuple["TaskInfo", Set[str]]:
+ ) -> Tuple["TaskInfo", List[str]]:
      if isinstance(name, str):
          name = [name]
-     names = set(name)
+     names = list(name)

      # Check that specified names are available in at least one of the tasks
      available_names_across_tasks = set(more_itertools.flatten([t.class_names for t in dataset.info.tasks]))
-     missing_class_names_across_tasks = names - available_names_across_tasks
+     missing_class_names_across_tasks = set(names) - available_names_across_tasks
      if len(missing_class_names_across_tasks) > 0:
          raise ValueError(
              f"The specified names {list(names)} have not been found in any of the tasks. "
@@ -370,15 +439,15 @@ def _validate_inputs_select_samples_by_class_name(

      # Auto infer task if task_name and primitive are not provided
      if task_name is None and primitive is None:
-         tasks_with_names = [t for t in dataset.info.tasks if names.issubset(t.class_names or [])]
+         tasks_with_names = [t for t in dataset.info.tasks if set(names).issubset(t.class_names or [])]
          if len(tasks_with_names) == 0:
              raise ValueError(
-                 f"The specified names {list(names)} have not been found in any of the tasks. "
+                 f"The specified names {names} have not been found in any of the tasks. "
                  f"Available class names: {available_names_across_tasks}"
              )
          if len(tasks_with_names) > 1:
              raise ValueError(
-                 f"Found multiple tasks containing the specified names {list(names)}. "
+                 f"Found multiple tasks containing the specified names {names}. "
                  f"Specify either 'task_name' or 'primitive' to only select from one task. "
                  f"Tasks containing all provided names: {[t.name for t in tasks_with_names]}"
              )
@@ -393,7 +462,7 @@ def _validate_inputs_select_samples_by_class_name(
          )

      task_class_names = set(task.class_names or [])
-     missing_class_names = names - task_class_names
+     missing_class_names = set(names) - task_class_names
      if len(missing_class_names) > 0:
          raise ValueError(
              f"The specified names {list(missing_class_names)} have not been found for the '{task.name}' task. "