hafnia-0.3.0-py3-none-any.whl → hafnia-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +3 -1
- cli/config.py +43 -3
- cli/keychain.py +88 -0
- cli/profile_cmds.py +5 -2
- hafnia/__init__.py +1 -1
- hafnia/dataset/dataset_helpers.py +9 -2
- hafnia/dataset/dataset_names.py +2 -1
- hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
- hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
- hafnia/dataset/dataset_upload_helper.py +60 -4
- hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
- hafnia/dataset/hafnia_dataset.py +176 -50
- hafnia/dataset/operations/dataset_stats.py +2 -3
- hafnia/dataset/operations/dataset_transformations.py +19 -15
- hafnia/dataset/operations/table_transformations.py +4 -3
- hafnia/dataset/primitives/bbox.py +25 -12
- hafnia/dataset/primitives/bitmask.py +26 -14
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +16 -9
- hafnia/dataset/primitives/segmentation.py +10 -7
- hafnia/experiment/hafnia_logger.py +0 -9
- hafnia/platform/dataset_recipe.py +7 -2
- hafnia/platform/datasets.py +3 -3
- hafnia/platform/download.py +23 -18
- hafnia/utils.py +17 -0
- hafnia/visualizations/image_visualizations.py +1 -1
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/METADATA +8 -6
- hafnia-0.4.0.dist-info/RECORD +56 -0
- hafnia-0.3.0.dist-info/RECORD +0 -53
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/format_conversions/torchvision_datasets.py
ADDED

@@ -0,0 +1,281 @@
+import inspect
+import os
+import shutil
+import tempfile
+import textwrap
+from pathlib import Path
+from typing import Callable, Dict, List, Optional, Tuple
+
+from rich.progress import track
+from torchvision import datasets as tv_datasets
+from torchvision.datasets import VisionDataset
+from torchvision.datasets.utils import download_and_extract_archive, extract_archive
+
+from hafnia import utils
+from hafnia.dataset.dataset_helpers import save_pil_image_with_hash_name
+from hafnia.dataset.dataset_names import SplitName
+from hafnia.dataset.format_conversions.image_classification_from_directory import (
+    import_image_classification_directory_tree,
+)
+from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.primitives import Classification
+
+
+def torchvision_to_hafnia_converters() -> Dict[str, Callable]:
+    return {
+        "mnist": mnist_as_hafnia_dataset,
+        "cifar10": cifar10_as_hafnia_dataset,
+        "cifar100": cifar100_as_hafnia_dataset,
+        "caltech-101": caltech_101_as_hafnia_dataset,
+        "caltech-256": caltech_256_as_hafnia_dataset,
+    }
+
+
+def mnist_as_hafnia_dataset(force_redownload=False, n_samples: Optional[int] = None) -> HafniaDataset:
+    samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+        dataset_loader=tv_datasets.MNIST,
+        force_redownload=force_redownload,
+        n_samples=n_samples,
+    )
+
+    dataset_info = DatasetInfo(
+        dataset_name="mnist",
+        version="1.1.0",
+        tasks=tasks,
+        reference_bibtex=textwrap.dedent("""\
+            @article{lecun2010mnist,
+              title={MNIST handwritten digit database},
+              author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
+              journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
+              volume={2},
+              year={2010}
+            }"""),
+        reference_paper_url=None,
+        reference_dataset_page="http://yann.lecun.com/exdb/mnist",
+    )
+    return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+def cifar10_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+    return cifar_as_hafnia_dataset(dataset_name="cifar10", force_redownload=force_redownload, n_samples=n_samples)
+
+
+def cifar100_as_hafnia_dataset(force_redownload: bool = False, n_samples: Optional[int] = None) -> HafniaDataset:
+    return cifar_as_hafnia_dataset(dataset_name="cifar100", force_redownload=force_redownload, n_samples=n_samples)
+
+
+def caltech_101_as_hafnia_dataset(
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    dataset_name = "caltech-101"
+    path_image_classification_folder = _download_and_extract_caltech_dataset(
+        dataset_name, force_redownload=force_redownload
+    )
+    hafnia_dataset = import_image_classification_directory_tree(
+        path_image_classification_folder,
+        split=SplitName.TRAIN,
+        n_samples=n_samples,
+    )
+    hafnia_dataset.info.dataset_name = dataset_name
+    hafnia_dataset.info.version = "1.1.0"
+    hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+        @article{FeiFei2004LearningGV,
+          title={Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian
+                 Approach Tested on 101 Object Categories},
+          author={Li Fei-Fei and Rob Fergus and Pietro Perona},
+          journal={Computer Vision and Pattern Recognition Workshop},
+          year={2004},
+        }
+    """)
+    hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/mzrjq-6wc02"
+
+    return hafnia_dataset
+
+
+def caltech_256_as_hafnia_dataset(
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    dataset_name = "caltech-256"
+
+    path_image_classification_folder = _download_and_extract_caltech_dataset(
+        dataset_name, force_redownload=force_redownload
+    )
+    hafnia_dataset = import_image_classification_directory_tree(
+        path_image_classification_folder,
+        split=SplitName.TRAIN,
+        n_samples=n_samples,
+    )
+    hafnia_dataset.info.dataset_name = dataset_name
+    hafnia_dataset.info.version = "1.1.0"
+    hafnia_dataset.info.reference_bibtex = textwrap.dedent("""\
+        @misc{griffin_2023_5sv1j-ytw97,
+          author    = {Griffin, Gregory and
+                       Holub, Alex and
+                       Perona, Pietro},
+          title     = {Caltech-256 Object Category Dataset},
+          month     = aug,
+          year      = 2023,
+          publisher = {California Institute of Technology},
+          version   = {public},
+        }""")
+    hafnia_dataset.info.reference_dataset_page = "https://data.caltech.edu/records/nyy15-4j048"
+
+    return hafnia_dataset
+
+
+def cifar_as_hafnia_dataset(
+    dataset_name: str,
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    if dataset_name == "cifar10":
+        dataset_loader = tv_datasets.CIFAR10
+    elif dataset_name == "cifar100":
+        dataset_loader = tv_datasets.CIFAR100
+    else:
+        raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: cifar10, cifar100")
+    samples, tasks = torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+        dataset_loader=dataset_loader,
+        force_redownload=force_redownload,
+        n_samples=n_samples,
+    )
+
+    dataset_info = DatasetInfo(
+        dataset_name=dataset_name,
+        version="1.1.0",
+        tasks=tasks,
+        reference_bibtex=textwrap.dedent("""\
+            @@TECHREPORT{Krizhevsky09learningmultiple,
+              author = {Alex Krizhevsky},
+              title = {Learning multiple layers of features from tiny images},
+              institution = {},
+              year = {2009}
+            }"""),
+        reference_paper_url="https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf",
+        reference_dataset_page="https://www.cs.toronto.edu/~kriz/cifar.html",
+    )
+
+    return HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+
+
+def torchvision_basic_image_classification_dataset_as_hafnia_dataset(
+    dataset_loader: VisionDataset,
+    force_redownload: bool = False,
+    n_samples: Optional[int] = None,
+) -> Tuple[List[Sample], List[TaskInfo]]:
+    """
+    Converts a certain group of torchvision-based image classification datasets to a Hafnia Dataset.
+
+    This conversion only works for certain group of image classification VisionDataset by torchvision.
+    Common for these datasets is:
+    1) They provide a 'class_to_idx' mapping,
+    2) A "train" boolean parameter in the init function to separate training and test data - thus no validation split
+       is available for these datasets,
+    3) Datasets are in-memory and not on disk
+    4) Samples consist of a PIL image and a class index.
+
+    """
+    torchvision_dataset_name = dataset_loader.__name__
+
+    # Check if loader has train-parameter using inspect module
+    params = inspect.signature(dataset_loader).parameters
+
+    has_train_param = ("train" in params) and (params["train"].annotation is bool)
+    if not has_train_param:
+        raise ValueError(
+            f"The dataset loader '{dataset_loader.__name__}' does not have a 'train: bool' parameter in the init "
+            "function. This is a sign that the wrong dataset loader is being used. This conversion function only "
+            "works for certain image classification datasets provided by torchvision that are similar to e.g. "
+            "MNIST, CIFAR-10, CIFAR-100"
+        )
+
+    path_torchvision_dataset = utils.get_path_torchvision_downloads() / torchvision_dataset_name
+    path_hafnia_conversions = utils.get_path_hafnia_conversions() / torchvision_dataset_name
+
+    if force_redownload:
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+        shutil.rmtree(path_hafnia_conversions, ignore_errors=True)
+
+    splits = {
+        SplitName.TRAIN: dataset_loader(root=path_torchvision_dataset, train=True, download=True),
+        SplitName.TEST: dataset_loader(root=path_torchvision_dataset, train=False, download=True),
+    }
+
+    samples = []
+    n_samples_per_split = n_samples // len(splits) if n_samples is not None else None
+    for split_name, torchvision_dataset in splits.items():
+        class_name_to_index = torchvision_dataset.class_to_idx
+        class_index_to_name = {v: k for k, v in class_name_to_index.items()}
+        description = f"Convert '{torchvision_dataset_name}' ({split_name} split) to Hafnia Dataset "
+        samples_in_split = []
+        for image, class_idx in track(torchvision_dataset, total=n_samples_per_split, description=description):
+            (width, height) = image.size
+            path_image = save_pil_image_with_hash_name(image, path_hafnia_conversions)
+            sample = Sample(
+                file_path=str(path_image),
+                height=height,
+                width=width,
+                split=split_name,
+                classifications=[
+                    Classification(
+                        class_name=class_index_to_name[class_idx],
+                        class_idx=class_idx,
+                    )
+                ],
+            )
+            samples_in_split.append(sample)
+
+            if n_samples_per_split is not None and len(samples_in_split) >= n_samples_per_split:
+                break
+
+        samples.extend(samples_in_split)
+    class_names = list(class_name_to_index.keys())
+    tasks = [TaskInfo(primitive=Classification, class_names=class_names)]
+
+    return samples, tasks
+
+
+def _download_and_extract_caltech_dataset(dataset_name: str, force_redownload: bool) -> Path:
+    path_torchvision_dataset = utils.get_path_torchvision_downloads() / dataset_name
+
+    if force_redownload:
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+
+    if path_torchvision_dataset.exists():
+        return path_torchvision_dataset
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        path_tmp_output = Path(tmpdirname)
+        path_tmp_output.mkdir(parents=True, exist_ok=True)
+
+        if dataset_name == "caltech-101":
+            download_and_extract_archive(
+                "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
+                download_root=path_tmp_output,
+                filename="caltech-101.zip",
+                md5="3138e1922a9193bfa496528edbbc45d0",
+            )
+            path_output_extracted = path_tmp_output / "caltech-101"
+            for gzip_file in os.listdir(path_output_extracted):
+                if gzip_file.endswith(".gz"):
+                    extract_archive(os.path.join(path_output_extracted, gzip_file), path_output_extracted)
+            path_org = path_output_extracted / "101_ObjectCategories"
+
+        elif dataset_name == "caltech-256":
+            org_dataset_name = "256_ObjectCategories"
+            path_org = path_tmp_output / org_dataset_name
+            download_and_extract_archive(
+                url=f"https://data.caltech.edu/records/nyy15-4j048/files/{org_dataset_name}.tar",
+                download_root=path_tmp_output,
+                md5="67b4f42ca05d46448c6bb8ecd2220f6d",
+                remove_finished=True,
+            )
+
+        else:
+            raise ValueError(f"Unknown dataset name: {dataset_name}. Supported: caltech-101, caltech-256")
+
+        shutil.rmtree(path_torchvision_dataset, ignore_errors=True)
+        shutil.move(path_org, path_torchvision_dataset)
+        return path_torchvision_dataset
hafnia/dataset/hafnia_dataset.py
CHANGED
@@ -8,14 +8,15 @@ from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
 from random import Random
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import more_itertools
 import numpy as np
 import polars as pl
+from packaging.version import Version
 from PIL import Image
 from pydantic import BaseModel, Field, field_serializer, field_validator
-from
+from rich.progress import track
 
 import hafnia
 from hafnia.dataset import dataset_helpers
@@ -29,10 +30,14 @@ from hafnia.dataset.dataset_names import (
     ColumnName,
     SplitName,
 )
-from hafnia.dataset.operations import
+from hafnia.dataset.operations import (
+    dataset_stats,
+    dataset_transformations,
+    table_transformations,
+)
 from hafnia.dataset.operations.table_transformations import (
     check_image_paths,
-
+    read_samples_from_path,
 )
 from hafnia.dataset.primitives import PRIMITIVE_TYPES, get_primitive_type_from_string
 from hafnia.dataset.primitives.bbox import Bbox
@@ -44,9 +49,17 @@ from hafnia.log import user_logger
 
 
 class TaskInfo(BaseModel):
-    primitive: Type[Primitive]
-
-
+    primitive: Type[Primitive] = Field(
+        description="Primitive class or string name of the primitive, e.g. 'Bbox' or 'bitmask'"
+    )
+    class_names: Optional[List[str]] = Field(default=None, description="Optional list of class names for the primitive")
+    name: Optional[str] = Field(
+        default=None,
+        description=(
+            "Optional name for the task. 'None' will use default name of the provided primitive. "
+            "e.g. Bbox ->'bboxes', Bitmask -> 'bitmasks' etc."
+        ),
+    )
 
     def model_post_init(self, __context: Any) -> None:
         if self.name is None:
@@ -99,17 +112,37 @@ class TaskInfo(BaseModel):
 
 
 class DatasetInfo(BaseModel):
-    dataset_name: str
-    version: str
-    tasks: List[TaskInfo]
-    distributions: Optional[List[TaskInfo]] = None
-
-
-
+    dataset_name: str = Field(description="Name of the dataset, e.g. 'coco'")
+    version: Optional[str] = Field(default=None, description="Version of the dataset")
+    tasks: List[TaskInfo] = Field(default=None, description="List of tasks in the dataset")
+    distributions: Optional[List[TaskInfo]] = Field(default=None, description="Optional list of task distributions")
+    reference_bibtex: Optional[str] = Field(
+        default=None,
+        description="Optional, BibTeX reference to dataset publication",
+    )
+    reference_paper_url: Optional[str] = Field(
+        default=None,
+        description="Optional, URL to dataset publication",
+    )
+    reference_dataset_page: Optional[str] = Field(
+        default=None,
+        description="Optional, URL to the dataset page",
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Optional metadata about the dataset")
+    format_version: str = Field(
+        default=hafnia.__dataset_format_version__,
+        description="Version of the Hafnia dataset format. You should not set this manually.",
+    )
+    updated_at: datetime = Field(
+        default_factory=datetime.now,
+        description="Timestamp of the last update to the dataset info. You should not set this manually.",
+    )
 
     @field_validator("tasks", mode="after")
     @classmethod
-    def _validate_check_for_duplicate_tasks(cls, tasks: List[TaskInfo]) -> List[TaskInfo]:
+    def _validate_check_for_duplicate_tasks(cls, tasks: Optional[List[TaskInfo]]) -> List[TaskInfo]:
+        if tasks is None:
+            return []
         task_name_counts = collections.Counter(task.name for task in tasks)
         duplicate_task_names = [name for name, count in task_name_counts.items() if count > 1]
         if duplicate_task_names:
@@ -118,6 +151,35 @@ class DatasetInfo(BaseModel):
         )
         return tasks
 
+    @field_validator("format_version")
+    @classmethod
+    def _validate_format_version(cls, format_version: str) -> str:
+        try:
+            Version(format_version)
+        except Exception as e:
+            raise ValueError(f"Invalid format_version '{format_version}'. Must be a valid version string.") from e
+
+        if Version(format_version) > Version(hafnia.__dataset_format_version__):
+            user_logger.warning(
+                f"The loaded dataset format version '{format_version}' is newer than the format version "
+                f"'{hafnia.__dataset_format_version__}' used in your version of Hafnia. Please consider "
+                f"updating Hafnia package."
+            )
+        return format_version
+
+    @field_validator("version")
+    @classmethod
+    def _validate_version(cls, dataset_version: Optional[str]) -> Optional[str]:
+        if dataset_version is None:
+            return None
+
+        try:
+            Version(dataset_version)
+        except Exception as e:
+            raise ValueError(f"Invalid dataset_version '{dataset_version}'. Must be a valid version string.") from e
+
+        return dataset_version
+
     def check_for_duplicate_task_names(self) -> List[TaskInfo]:
         return self._validate_check_for_duplicate_tasks(self.tasks)
 
@@ -187,7 +249,7 @@ class DatasetInfo(BaseModel):
         meta.update(info1.meta or {})
         return DatasetInfo(
             dataset_name=info0.dataset_name + "+" + info1.dataset_name,
-            version=
+            version=None,
             tasks=list(unique_tasks),
             distributions=list(distributions),
             meta=meta,
@@ -258,22 +320,40 @@ class DatasetInfo(BaseModel):
 
 
 class Sample(BaseModel):
-
-    height: int
-    width: int
-    split: str
-    tags: List[str] =
-
-
-
-
-
-
-
-
-
-
-
+    file_path: str = Field(description="Path to the image file")
+    height: int = Field(description="Height of the image")
+    width: int = Field(description="Width of the image")
+    split: str = Field(description="Split name, e.g., 'train', 'val', 'test'")
+    tags: List[str] = Field(
+        default_factory=list,
+        description="Tags for a given sample. Used for creating subsets of the dataset.",
+    )
+    collection_index: Optional[int] = Field(default=None, description="Optional e.g. frame number for video datasets")
+    collection_id: Optional[str] = Field(default=None, description="Optional e.g. video name for video datasets")
+    remote_path: Optional[str] = Field(default=None, description="Optional remote path for the image, if applicable")
+    sample_index: Optional[int] = Field(
+        default=None,
+        description="Don't manually set this, it is used for indexing samples in the dataset.",
+    )
+    classifications: Optional[List[Classification]] = Field(
+        default=None, description="Optional list of classifications"
+    )
+    objects: Optional[List[Bbox]] = Field(default=None, description="Optional list of objects (bounding boxes)")
+    bitmasks: Optional[List[Bitmask]] = Field(default=None, description="Optional list of bitmasks")
+    polygons: Optional[List[Polygon]] = Field(default=None, description="Optional list of polygons")
+
+    attribution: Optional[Attribution] = Field(default=None, description="Attribution information for the image")
+    dataset_name: Optional[str] = Field(
+        default=None,
+        description=(
+            "Don't manually set this, it will be automatically defined during initialization. "
+            "Name of the dataset the sample belongs to. E.g. 'coco-2017' or 'midwest-vehicle-detection'."
+        ),
+    )
+    meta: Optional[Dict] = Field(
+        default=None,
+        description="Additional metadata, e.g., camera settings, GPS data, etc.",
+    )
 
     def get_annotations(self, primitive_types: Optional[List[Type[Primitive]]] = None) -> List[Primitive]:
         """
@@ -294,7 +374,7 @@ class Sample(BaseModel):
         Reads the image from the file path and returns it as a PIL Image.
         Raises FileNotFoundError if the image file does not exist.
         """
-        path_image = Path(self.
+        path_image = Path(self.file_path)
         if not path_image.exists():
             raise FileNotFoundError(f"Image file {path_image} does not exist. Please check the file path.")
 
@@ -413,30 +493,23 @@ class HafniaDataset:
             yield row
 
     def __post_init__(self):
-        samples = self.samples
-        if ColumnName.SAMPLE_INDEX not in samples.columns:
-            samples = samples.with_row_index(name=ColumnName.SAMPLE_INDEX)
-
-        # Backwards compatibility: If tags-column doesn't exist, create it with empty lists
-        if ColumnName.TAGS not in samples.columns:
-            tags_column: List[List[str]] = [[] for _ in range(len(self))]  # type: ignore[annotation-unchecked]
-            samples = samples.with_columns(pl.Series(tags_column, dtype=pl.List(pl.String)).alias(ColumnName.TAGS))
-
-        self.samples = samples
+        self.samples, self.info = _dataset_corrections(self.samples, self.info)
 
     @staticmethod
     def from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
+        path_folder = Path(path_folder)
         HafniaDataset.check_dataset_path(path_folder, raise_error=True)
 
         dataset_info = DatasetInfo.from_json_file(path_folder / FILENAME_DATASET_INFO)
-
+        samples = read_samples_from_path(path_folder)
+        samples, dataset_info = _dataset_corrections(samples, dataset_info)
 
         # Convert from relative paths to absolute paths
         dataset_root = path_folder.absolute().as_posix() + "/"
-
+        samples = samples.with_columns((dataset_root + pl.col(ColumnName.FILE_PATH)).alias(ColumnName.FILE_PATH))
         if check_for_images:
-            check_image_paths(
-        return HafniaDataset(samples=
+            check_image_paths(samples)
+        return HafniaDataset(samples=samples, info=dataset_info)
 
     @staticmethod
     def from_name(name: str, force_redownload: bool = False, download_files: bool = True) -> "HafniaDataset":
@@ -464,6 +537,14 @@ class HafniaDataset:
 
         table = pl.from_records(json_samples)
         table = table.drop(ColumnName.SAMPLE_INDEX).with_row_index(name=ColumnName.SAMPLE_INDEX)
+
+        # Add 'dataset_name' to samples
+        table = table.with_columns(
+            pl.when(pl.col(ColumnName.DATASET_NAME).is_null())
+            .then(pl.lit(info.dataset_name))
+            .otherwise(pl.col(ColumnName.DATASET_NAME))
+            .alias(ColumnName.DATASET_NAME)
+        )
         return HafniaDataset(info=info, samples=table)
 
     @staticmethod
@@ -518,6 +599,28 @@ class HafniaDataset:
         merged_dataset = HafniaDataset.merge(merged_dataset, dataset)
         return merged_dataset
 
+    @staticmethod
+    def from_name_public_dataset(
+        name: str,
+        force_redownload: bool = False,
+        n_samples: Optional[int] = None,
+    ) -> HafniaDataset:
+        from hafnia.dataset.format_conversions.torchvision_datasets import (
+            torchvision_to_hafnia_converters,
+        )
+
+        name_to_torchvision_function = torchvision_to_hafnia_converters()
+
+        if name not in name_to_torchvision_function:
+            raise ValueError(
+                f"Unknown torchvision dataset name: {name}. Supported: {list(name_to_torchvision_function.keys())}"
+            )
+        vision_dataset = name_to_torchvision_function[name]
+        return vision_dataset(
+            force_redownload=force_redownload,
+            n_samples=n_samples,
+        )
+
     def shuffle(dataset: HafniaDataset, seed: int = 42) -> HafniaDataset:
         table = dataset.samples.sample(n=len(dataset), with_replacement=False, seed=seed, shuffle=True)
         return dataset.update_samples(table)
@@ -607,7 +710,7 @@ class HafniaDataset:
 
     def class_mapper(
         dataset: "HafniaDataset",
-        class_mapping: Dict[str, str],
+        class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
         method: str = "strict",
         primitive: Optional[Type[Primitive]] = None,
         task_name: Optional[str] = None,
@@ -778,13 +881,14 @@ class HafniaDataset:
         path_folder.mkdir(parents=True)
 
         new_relative_paths = []
-
+        org_paths = self.samples[ColumnName.FILE_PATH].to_list()
+        for org_path in track(org_paths, description="- Copy images"):
             new_path = dataset_helpers.copy_and_rename_file_to_hash_value(
                 path_source=Path(org_path),
                 path_dataset_root=path_folder,
            )
            new_relative_paths.append(str(new_path.relative_to(path_folder)))
-        table = self.samples.with_columns(pl.Series(new_relative_paths).alias(
+        table = self.samples.with_columns(pl.Series(new_relative_paths).alias(ColumnName.FILE_PATH))
 
         if drop_null_cols:  # Drops all unused/Null columns
             table = table.drop(pl.selectors.by_dtype(pl.Null))
@@ -846,3 +950,25 @@ def get_or_create_dataset_path_from_recipe(
     dataset.write(path_dataset)
 
     return path_dataset
+
+
+def _dataset_corrections(samples: pl.DataFrame, dataset_info: DatasetInfo) -> Tuple[pl.DataFrame, DatasetInfo]:
+    format_version_of_dataset = Version(dataset_info.format_version)
+
+    ## Backwards compatibility fixes for older dataset versions
+    if format_version_of_dataset <= Version("0.3.0"):
+        if ColumnName.DATASET_NAME not in samples.columns:
+            samples = samples.with_columns(pl.lit(dataset_info.dataset_name).alias(ColumnName.DATASET_NAME))
+
+        if "file_name" in samples.columns:
+            samples = samples.rename({"file_name": ColumnName.FILE_PATH})
+
+        if ColumnName.SAMPLE_INDEX not in samples.columns:
+            samples = samples.with_row_index(name=ColumnName.SAMPLE_INDEX)
+
+        # Backwards compatibility: If tags-column doesn't exist, create it with empty lists
+        if ColumnName.TAGS not in samples.columns:
+            tags_column: List[List[str]] = [[] for _ in range(len(samples))]  # type: ignore[annotation-unchecked]
+            samples = samples.with_columns(pl.Series(tags_column, dtype=pl.List(pl.String)).alias(ColumnName.TAGS))
+
+    return samples, dataset_info
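
As a rough sketch (not shown in the diff itself) of how the additions above fit together: the new HafniaDataset.from_name_public_dataset entry point dispatches to the torchvision converters, and DatasetInfo now validates its version strings. Signatures follow the hunks above; the example values are illustrative only.

from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, TaskInfo
from hafnia.dataset.primitives import Classification

# New in 0.4.0: build a Hafnia dataset from a supported public torchvision dataset by name.
dataset = HafniaDataset.from_name_public_dataset("cifar10", n_samples=100)

# 'version' is now optional, but when given it must parse as a valid version string
# (packaging.version.Version); a 'format_version' newer than the installed package only warns.
info = DatasetInfo(
    dataset_name="my-dataset",
    version="1.0.0",
    tasks=[TaskInfo(primitive=Classification, class_names=["cat", "dog"])],
)
# version="not-a-version" would be rejected by the new version validator.
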
hafnia/dataset/operations/dataset_stats.py
CHANGED

@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Dict, Optional, Type
 import polars as pl
 import rich
 from rich import print as rprint
+from rich.progress import track
 from rich.table import Table
-from tqdm import tqdm
 
 from hafnia.dataset.dataset_names import ColumnName, FieldName, SplitName
 from hafnia.dataset.operations.table_transformations import create_primitive_table
@@ -179,7 +179,6 @@ def check_dataset(dataset: HafniaDataset):
     from hafnia.dataset.hafnia_dataset import Sample
 
     user_logger.info("Checking Hafnia dataset...")
-    assert isinstance(dataset.info.version, str) and len(dataset.info.version) > 0
     assert isinstance(dataset.info.dataset_name, str) and len(dataset.info.dataset_name) > 0
 
     sample_dataset = dataset.create_sample_dataset()
@@ -215,7 +214,7 @@ def check_dataset(dataset: HafniaDataset):
         f"classes: {class_names}. "
     )
 
-    for sample_dict in
+    for sample_dict in track(dataset, description="Checking samples in dataset"):
         sample = Sample(**sample_dict)  # noqa: F841
 
 