hafnia 0.4.3__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/tests.yaml +4 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.vscode/settings.json +1 -1
- {hafnia-0.4.3 → hafnia-0.5.0}/PKG-INFO +3 -3
- {hafnia-0.4.3 → hafnia-0.5.0}/README.md +1 -1
- {hafnia-0.4.3 → hafnia-0.5.0}/examples/example_hafnia_dataset.py +39 -24
- {hafnia-0.4.3 → hafnia-0.5.0}/pyproject.toml +2 -2
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_details_uploader.py +41 -54
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_helpers.py +1 -15
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_names.py +17 -3
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/torchvision_datasets.py +6 -3
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/hafnia_dataset.py +99 -24
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/hafnia_dataset_types.py +3 -1
- hafnia-0.5.0/src/hafnia/dataset/operations/dataset_s3_storage.py +211 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/operations/table_transformations.py +2 -1
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/http.py +2 -1
- hafnia-0.5.0/src/hafnia/platform/datasets.py +334 -0
- hafnia-0.5.0/src/hafnia/platform/s5cmd_utils.py +147 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/utils.py +4 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/dataset_cmds.py +18 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/profile_cmds.py +0 -1
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/helper_testing.py +5 -0
- hafnia-0.5.0/tests/integration/test_bring_your_own_data.py +93 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_torchvision_datasets.py +1 -5
- hafnia-0.5.0/tests/unit/dataset/test_dataset_details_uploader.py +55 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/uv.lock +1290 -1176
- hafnia-0.4.3/src/hafnia/platform/datasets.py +0 -243
- hafnia-0.4.3/tests/unit/dataset/test_dataset_details_uploader.py +0 -29
- {hafnia-0.4.3 → hafnia-0.5.0}/.devcontainer/devcontainer.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.devcontainer/hooks/post_create +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/dependabot.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/Dockerfile +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/build.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/check_release.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/ci_cd.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/lint.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/publish_docker.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/publish_pypi.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.gitignore +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.pre-commit-config.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.python-version +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.trivyignore +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.vscode/extensions.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/.vscode/launch.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/LICENSE +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/docs/cli.md +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/docs/release.md +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/examples/example_dataset_recipe.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/examples/example_logger.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/examples/example_torchvision_dataloader.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/data/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/data/factory.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_recipe/dataset_recipe.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_recipe/recipe_transforms.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_recipe/recipe_types.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/format_coco.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/format_helpers.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/format_image_classification_folder.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/format_yolo.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/license_types.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/operations/dataset_stats.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/operations/dataset_transformations.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/bbox.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/bitmask.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/classification.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/point.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/polygon.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/primitive.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/segmentation.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/primitives/utils.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/experiment/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/experiment/hafnia_logger.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/log.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/builder.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/dataset_recipe.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/download.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/experiment.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/platform/trainer_package.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/torch_helpers.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/visualizations/colors.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/visualizations/image_visualizations.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/__main__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/config.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/consts.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/dataset_recipe_cmds.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/experiment_cmds.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/keychain.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/runc_cmds.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia_cli/trainer_package_cmds.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/__init__.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/conftest.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/train/000000000632.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/train/000000000724.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/train/_annotations.coco.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/valid/000000000139.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/valid/000000000285.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_coco_roboflow/valid/_annotations.coco.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/obj.names +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/train/data/000000000139.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/train/data/000000000139.txt +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/train/data/000000000285.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/train/data/000000000285.txt +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/train/images.txt +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/validation/data/000000000632.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/validation/data/000000000632.txt +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_formats/format_yolo/validation/images.txt +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/dataset_image_metadata_schema.yaml +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_dataset_transformations/test_video_storage_format_read_image.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_convert_segmentation_to_rle_list[polygon].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_convert_segmentation_to_rle_list[rle_as_ints].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_convert_segmentation_to_rle_list[rle_compressed_bytes].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_convert_segmentation_to_rle_list[rle_compressed_str].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_from_coco_format_visualized.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_coco/test_to_coco_format_visualized.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_yolo/test_format_yolo_import_export_tiny_dataset.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_format_yolo/test_import_yolo_format_visualized.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[caltech-101].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[caltech-256].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[cifar100].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[cifar10].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[coco-2017].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[midwest-vehicle-detection].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[mnist].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_check_dataset[tiny-dataset].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-101].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[caltech-256].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar100].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[cifar10].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[coco-2017].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[midwest-vehicle-detection].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[mnist].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_samples/test_dataset_draw_image_and_target[tiny-dataset].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_blur_anonymization[micro-coco-2017].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_blur_anonymization[micro-tiny-dataset].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_draw_annotations[micro-coco-2017].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_draw_annotations[micro-tiny-dataset].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_mask_region[micro-coco-2017].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_mask_region[micro-tiny-dataset].png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/expected_images/test_visualizations/test_polygon_to_bitmask_conversion.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/annotations.jsonl +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/annotations.parquet +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/data/253/253925d334c002ce6662d8133535dd4c.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/data/b1a/b1a09f4d922f8f6904bab0c1caf172ab.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/data/f67/f675c8a1e862b5e00203ab888ac7fff4.jpg +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-coco-2017/dataset_info.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/annotations.jsonl +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/annotations.parquet +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/data/25c/25c3a206e7b60ab50245ee3d52d97f11.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/data/962/962fd865fdd45f169d5ca8c8f284d68d.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/data/ec6/ec60f2f4fb854b59c97e16b45c713de0.png +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/data/micro_test_datasets/micro-tiny-dataset/dataset_info.json +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_check_example_scripts.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_cli_integration.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_dataset_merges.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_dataset_recipes_with_platform.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/integration/test_samples.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/dataset_recipe/test_dataset_recipe_helpers.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/dataset_recipe/test_dataset_recipes.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/dataset_recipe/test_recipe_transformations.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/format_conversions/test_format_coco.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/format_conversions/test_format_image_classification_folder.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/format_conversions/test_format_yolo.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/operations/test_dataset_stats.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/operations/test_dataset_transformations.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/operations/test_table_transformations.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/test_colors.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/test_dataset_helpers.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/test_dataset_names.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/test_hafnia_dataset.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/dataset/test_shape_primitives.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/test_builder.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/test_cli.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/test_hafnia_logger.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/test_utils.py +0 -0
- {hafnia-0.4.3 → hafnia-0.5.0}/tests/unit/test_visualizations.py +0 -0
{hafnia-0.4.3 → hafnia-0.5.0}/.github/workflows/tests.yaml
@@ -32,6 +32,10 @@ jobs:
         run: |
           mkdir -p ~/.hafnia
           echo "$HAFNIA_CONFIG" | jq . > ~/.hafnia/config.json
+      - name: Check hafnia configured
+        run: uv run hafnia profile active
+      - name: Check hafnia by download
+        run: uv run hafnia dataset download mnist --force
       - name: Run tests
         run: uv run pytest tests

{hafnia-0.4.3 → hafnia-0.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hafnia
-Version: 0.4.3
+Version: 0.5.0
 Summary: Python SDK for communication with Hafnia platform.
 Author-email: Milestone Systems <hafniaplatform@milestone.dk>
 License-File: LICENSE
@@ -10,7 +10,7 @@ Requires-Dist: click>=8.1.8
 Requires-Dist: emoji>=2.14.1
 Requires-Dist: flatten-dict>=0.4.2
 Requires-Dist: keyring>=25.6.0
-Requires-Dist: mcp
+Requires-Dist: mcp>=1.16.0
 Requires-Dist: mlflow>=3.4.0
 Requires-Dist: more-itertools>=10.7.0
 Requires-Dist: opencv-python-headless>=4.11.0.86
@@ -209,7 +209,7 @@ DatasetInfo(
 ```
 
 You can iterate and access samples in the dataset using the `HafniaDataset` object.
-Each sample contain image and annotations information. 
+Each sample contain image and annotations information.
 
 ```python
 from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample
{hafnia-0.4.3 → hafnia-0.5.0}/README.md
@@ -180,7 +180,7 @@ DatasetInfo(
 ```
 
 You can iterate and access samples in the dataset using the `HafniaDataset` object.
-Each sample contain image and annotations information. 
+Each sample contain image and annotations information.
 
 ```python
 from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample
{hafnia-0.4.3 → hafnia-0.5.0}/examples/example_hafnia_dataset.py
@@ -138,43 +138,58 @@ path_tmp.mkdir(parents=True, exist_ok=True)
 Image.fromarray(image_with_annotations).save(path_tmp / "sample_with_annotations.png")
 
 
-##
+## Create a hafnia dataset from scratch ##
+path_yolo_dataset = Path("tests/data/dataset_formats/format_yolo/train")
+path_class_names = path_yolo_dataset.parent / "obj.names"
+class_names = [line.strip() for line in path_class_names.read_text().splitlines() if line.strip()]
+path_images_file = path_yolo_dataset / "images.txt"
+image_files = [line.strip() for line in path_images_file.read_text().splitlines() if line.strip()]
+
 fake_samples = []
-for
-
-
-
-
-
-
-
-
-
-
-
+for image_file in image_files:
+    path_image = path_yolo_dataset / image_file
+    path_bboxes = path_yolo_dataset / image_file.replace(".jpg", ".txt")
+    bboxes: List[Bbox] = []
+    for bboxes_line in path_bboxes.read_text().splitlines():
+        str_parts = bboxes_line.strip().split()
+        class_idx = int(str_parts[0])
+        x_center, y_center, bbox_width, bbox_height = (float(value) for value in str_parts[1:5])
+        bbox = Bbox(
+            top_left_x=x_center - bbox_width / 2,
+            top_left_y=y_center - bbox_height / 2,
+            width=bbox_width,
+            height=bbox_height,
+            class_idx=class_idx,
+            class_name=class_names[class_idx],
+        )
+        bboxes.append(bbox)
+    image = Image.open(path_image)
+    height, width = image.size[1], image.size[0]
+    sample = Sample(file_path=str(path_image), height=height, width=width, split="train", bboxes=bboxes)
     fake_samples.append(sample)
 
 
 fake_dataset_info = DatasetInfo(
-    dataset_name="
+    dataset_name="custom-dataset",
     version="0.0.1",
-    tasks=[
-        TaskInfo(primitive=Bbox, class_names=["car", "truck", "bus"]),
-        TaskInfo(primitive=Classification, class_names=["vehicle", "pedestrian", "cyclist"]),
-    ],
+    tasks=[TaskInfo(primitive=Bbox, class_names=class_names)],
 )
-
+custom_dataset = HafniaDataset.from_samples_list(samples_list=fake_samples, info=fake_dataset_info)
+
+sample = Sample(**custom_dataset[0])
+
+# To visualize and verify dataset is formatted correctly store image with annotations
+image_with_annotations = sample.draw_annotations()
+Image.fromarray(image_with_annotations).save(path_tmp / "custom_dataset_sample.png")  # Save visualization to TM
 
-#
-#
+# To upload the dataset to Hafnia platform
+# custom_dataset.upload_to_platform(interactive=True, allow_version_overwrite=False)
 
-# Coming soon! Create your own dataset details page in Hafnia
-# fake_dataset.upload_dataset_details()
 
 ## Storing predictions: A hafnia dataset can also be used for storing predictions per sample
 # set 'ground_truth=False' and add 'confidence'.
 bboxes_predictions = [
-    Bbox(top_left_x=
+    Bbox(top_left_x=0.1, top_left_y=0.2, width=0.3, height=0.4, class_name="car", ground_truth=False, confidence=0.9)
 ]
 
 classifications_predictions = [Classification(class_name="vehicle", class_idx=0, ground_truth=False, confidence=0.95)]
{hafnia-0.4.3 → hafnia-0.5.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hafnia"
-version = "0.4.3"
+version = "0.5.0"
 description = "Python SDK for communication with Hafnia platform."
 readme = "README.md"
 authors = [
@@ -28,7 +28,7 @@ dependencies = [
     "xxhash>=3.5.0",
     "mlflow>=3.4.0",
     "sagemaker-mlflow>=0.1.0",
-    "mcp",
+    "mcp>=1.16.0",
 ]
 
 [dependency-groups]
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_details_uploader.py
@@ -4,7 +4,7 @@ import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Type, Union
 
 import boto3
 import polars as pl
@@ -13,7 +13,6 @@ from pydantic import BaseModel, ConfigDict, field_validator
 
 from hafnia.dataset.dataset_names import (
     DatasetVariant,
-    DeploymentStage,
     PrimitiveField,
     SampleField,
     SplitName,
@@ -29,26 +28,21 @@ from hafnia.dataset.primitives import (
     Segmentation,
 )
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.
-from hafnia.
-from hafnia.platform.datasets import get_dataset_id
+from hafnia.platform.datasets import upload_dataset_details
+from hafnia.utils import get_path_dataset_gallery_images
 from hafnia_cli.config import Config
 
 
-def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
-    # TODO: When moving to versioning we do NOT need 'staging' and 'production' specific buckets
-    # and the new name convention should be: f"hafnia-dataset-{dataset_name}"
-    return f"mdi-{deployment_stage.value}-{dataset_name}"
-
-
 class DatasetDetails(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
+    title: Optional[str] = None
+    overview: Optional[str] = None
     data_captured_start: Optional[datetime] = None
     data_captured_end: Optional[datetime] = None
     data_received_start: Optional[datetime] = None
     data_received_end: Optional[datetime] = None
-
+    dataset_updated_at: Optional[datetime] = None
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
@@ -281,26 +275,32 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
 
 
-def
-
-
-
-
-
-
-
-
-
-
-
-
+def upload_dataset_details_to_platform(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
+    distribution_task_names: Optional[List[str]] = None,
+    update_platform: bool = True,
+    cfg: Optional[Config] = None,
+) -> dict:
+    cfg = cfg or Config()
+    dataset_details = dataset_details_from_hafnia_dataset(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+        distribution_task_names=distribution_task_names,
+    )
 
-
-
+    if update_platform:
+        dataset_details_exclude_none = dataset_details.model_dump(exclude_none=True, mode="json")
+        upload_dataset_details(
+            cfg=cfg,
+            data=dataset_details_exclude_none,
+            dataset_name=dataset_details.name,
+        )
 
-
-
-    return response  # type: ignore[return-value]
+    dataset_details_dict = dataset_details.model_dump(exclude_none=False, mode="json")
+    return dataset_details_dict
 
 
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
@@ -360,9 +360,6 @@ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: bot
 
 def dataset_details_from_hafnia_dataset(
     dataset: HafniaDataset,
-    deployment_stage: DeploymentStage,
-    path_sample: Optional[Path],
-    path_hidden: Optional[Path],
     path_gallery_images: Optional[Path] = None,
     gallery_image_names: Optional[List[str]] = None,
     distribution_task_names: Optional[List[str]] = None,
@@ -371,33 +368,24 @@ def dataset_details_from_hafnia_dataset(
     dataset_reports = []
     dataset_meta_info = dataset.info.meta or {}
 
-    path_and_variant
-    if path_sample is not None:
-        path_and_variant.append((path_sample, DatasetVariant.SAMPLE))
-
-    if path_hidden is not None:
-        path_and_variant.append((path_hidden, DatasetVariant.HIDDEN))
-
-    if len(path_and_variant) == 0:
-        raise ValueError("At least one path must be provided for sample or hidden dataset.")
-
+    path_and_variant = [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]
     gallery_images = create_gallery_images(
         dataset=dataset,
         path_gallery_images=path_gallery_images,
         gallery_image_names=gallery_image_names,
     )
 
-    for
+    for variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
         else:
            dataset_variant = dataset
 
-
+        files_paths = dataset_variant.samples[SampleField.FILE_PATH].to_list()
+        size_bytes = sum([Path(file_path).stat().st_size for file_path in files_paths])
        dataset_variants.append(
             DbDatasetVariant(
                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
-                # upload_date: Optional[datetime] = None
                 size_bytes=size_bytes,
                 data_type=DataTypeChoices.images,
                 number_of_data_items=len(dataset_variant),
@@ -405,7 +393,6 @@ def dataset_details_from_hafnia_dataset(
                 duration=dataset_meta_info.get("duration", None),
                 duration_average=dataset_meta_info.get("duration_average", None),
                 frame_rate=dataset_meta_info.get("frame_rate", None),
-                # bit_rate: Optional[float] = None
                 n_cameras=dataset_meta_info.get("n_cameras", None),
             )
         )
@@ -435,19 +422,19 @@ def dataset_details_from_hafnia_dataset(
         object_reports = sorted(object_reports, key=lambda x: x.obj.name)  # Sort object reports by name
         report.annotated_object_reports = object_reports
 
-
-
+        if report.distribution_values is None:
+            report.distribution_values = []
 
-
+        dataset_reports.append(report)
     dataset_name = dataset.info.dataset_name
-    bucket_sample = generate_bucket_name(dataset_name, deployment_stage=deployment_stage)
     dataset_info = DatasetDetails(
         name=dataset_name,
+        title=dataset.info.dataset_title,
+        overview=dataset.info.description,
         version=dataset.info.version,
-        s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-
+        dataset_updated_at=dataset.info.updated_at,
         dataset_format_version=dataset.info.format_version,
         license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
@@ -565,7 +552,7 @@ def create_gallery_images(
     gallery_images = None
     if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
         if path_gallery_images is None:
-
+            path_gallery_images = get_path_dataset_gallery_images(dataset.info.dataset_name)
         path_gallery_images.mkdir(parents=True, exist_ok=True)
         COL_IMAGE_NAME = "image_name"
         samples = dataset.samples.with_columns(
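The uploader's public entry point is now `upload_dataset_details_to_platform`, which no longer needs a deployment stage or explicit sample/hidden paths. Below is a minimal sketch (not part of the diff) of how the new `update_platform` flag could be used for a local dry run; the `DatasetInfo`/`TaskInfo` import path and the exact `Sample`/`Bbox` fields follow the example script above and are otherwise assumptions.

```python
from pathlib import Path

from PIL import Image

from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform
from hafnia.dataset.hafnia_dataset import HafniaDataset, Sample
from hafnia.dataset.hafnia_dataset_types import DatasetInfo, TaskInfo  # import path assumed
from hafnia.dataset.primitives import Bbox

# Build a tiny throw-away dataset from an image that ships with the repo's test data.
path_image = Path("tests/data/micro_test_datasets/micro-tiny-dataset/data/25c/25c3a206e7b60ab50245ee3d52d97f11.png")
image = Image.open(path_image)
sample = Sample(
    file_path=str(path_image),
    height=image.size[1],
    width=image.size[0],
    split="train",
    bboxes=[Bbox(top_left_x=0.1, top_left_y=0.2, width=0.3, height=0.4, class_idx=0, class_name="car")],
)
info = DatasetInfo(
    dataset_name="uploader-smoke-test",  # hypothetical dataset name
    version="0.0.1",
    tasks=[TaskInfo(primitive=Bbox, class_names=["car"])],
)
dataset = HafniaDataset.from_samples_list(samples_list=[sample], info=info)

# Dry run: with update_platform=False the function only builds the DatasetDetails
# payload locally and returns it as a dict; nothing is sent to the platform.
details = upload_dataset_details_to_platform(dataset=dataset, update_platform=False)
print(details["name"], details["version"])
```

With `update_platform=True` (the default), the same call posts the payload through `hafnia.platform.datasets.upload_dataset_details`.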
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_helpers.py
@@ -57,20 +57,6 @@ def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_s
 def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
     """
     Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
-
-    E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
-    'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
-    Notice that the hash is used for both the filename and the subfolder name.
-
-    Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
-    unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
-
-    The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
-    download 3500 files per second from a single folder (prefix) in S3.
-
-    For even a single user, we found that this limit was being reached when files are stored in single folder (prefix)
-    in S3. To support multiple users and concurrent experiments, we are required to separate files into
-    multiple sub-folders (prefixes) in S3 to not hit the rate limit.
     """
 
     if not path_source.exists():
@@ -86,7 +72,7 @@ def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Pat
 
 
 def relative_path_from_hash(hash: str, suffix: str) -> Path:
-    path_file = Path("data") /
+    path_file = Path("data") / f"{hash}{suffix}"
    return path_file
 
 
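The docstring removed above described a `data/<first-3-chars-of-hash>/<hash><suffix>` layout motivated by S3 per-prefix rate limits; after this change `relative_path_from_hash` returns a flat `data/<hash><suffix>` path. A small illustration of the new behaviour (the hash value here is made up):

```python
from hafnia.dataset.dataset_helpers import relative_path_from_hash

# The function now simply joins "data" with "<hash><suffix>".
print(relative_path_from_hash(hash="dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", suffix=".png"))
# data/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png
```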
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/dataset_names.py
@@ -2,6 +2,7 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 import boto3
+from botocore.exceptions import UnauthorizedSSOTokenError
 from pydantic import BaseModel, field_validator
 
 FILENAME_RECIPE_JSON = "recipe.json"
@@ -21,6 +22,7 @@ class DeploymentStage(Enum):
     PRODUCTION = "production"
 
 
+ARN_PREFIX = "arn:aws:s3:::"
 TAG_IS_SAMPLE = "sample"
 
 OPS_REMOVE_CLASS = "__REMOVE__"
@@ -151,7 +153,14 @@ class AwsCredentials(BaseModel):
         """
         Creates AwsCredentials from a Boto3 session.
         """
-
+        try:
+            frozen_credentials = session.get_credentials().get_frozen_credentials()
+        except UnauthorizedSSOTokenError as e:
+            raise RuntimeError(
+                f"Failed to get AWS credentials from the session for profile '{session.profile_name}'.\n"
+                f"Ensure the profile exists in your AWS config in '~/.aws/config' and that you are logged in via AWS SSO.\n"
+                f"\tUse 'aws sso login --profile {session.profile_name}' to log in."
+            ) from e
         return AwsCredentials(
             access_key=frozen_credentials.access_key,
             secret_key=frozen_credentials.secret_key,
@@ -159,8 +168,13 @@ class AwsCredentials(BaseModel):
             region=session.region_name,
         )
 
-
-
+    def to_resource_credentials(self, bucket_name: str) -> "ResourceCredentials":
+        """
+        Converts AwsCredentials to ResourceCredentials by adding the S3 ARN.
+        """
+        payload = self.model_dump()
+        payload["s3_arn"] = f"{ARN_PREFIX}{bucket_name}"
+        return ResourceCredentials(**payload)
 
 
 class ResourceCredentials(AwsCredentials):
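For reference, a hedged sketch of how the new SSO error handling and `to_resource_credentials` helper might be exercised. The classmethod name used here (`from_session`) is an assumption, since the `def` line sits outside the hunk; only the body and the `to_resource_credentials` signature are shown in the diff, and the profile and bucket names are placeholders.

```python
import boto3

from hafnia.dataset.dataset_names import AwsCredentials

# Profile name is hypothetical; use the AWS SSO profile you normally log in with.
session = boto3.Session(profile_name="my-sso-profile")

# The constructor body shown in the hunk now raises a RuntimeError with an
# "aws sso login --profile ..." hint if the SSO token is missing or expired.
credentials = AwsCredentials.from_session(session)  # classmethod name assumed

# New helper: attach the S3 ARN for a bucket to produce ResourceCredentials.
resource_credentials = credentials.to_resource_credentials(bucket_name="my-dataset-bucket")
print(resource_credentials.s3_arn)  # arn:aws:s3:::my-dataset-bucket
```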
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/format_conversions/torchvision_datasets.py
@@ -40,7 +40,7 @@ def mnist_as_hafnia_dataset(force_redownload=False, n_samples: Optional[int] = N
 
     dataset_info = DatasetInfo(
         dataset_name="mnist",
-        version="1.
+        version="1.0.0",
         tasks=tasks,
         reference_bibtex=textwrap.dedent("""\
             @article{lecun2010mnist,
@@ -150,7 +150,7 @@ def cifar_as_hafnia_dataset(
 
     dataset_info = DatasetInfo(
         dataset_name=dataset_name,
-        version="1.
+        version="1.0.0",
         tasks=tasks,
         reference_bibtex=textwrap.dedent("""\
             @@TECHREPORT{Krizhevsky09learningmultiple,
@@ -268,7 +268,10 @@ def _download_and_extract_caltech_dataset(dataset_name: str, force_redownload: b
         path_output_extracted = path_tmp_output / "caltech-101"
         for gzip_file in os.listdir(path_output_extracted):
             if gzip_file.endswith(".gz"):
-                extract_archive(
+                extract_archive(
+                    from_path=os.path.join(path_output_extracted, gzip_file),
+                    to_path=path_output_extracted,
+                )
         path_org = path_output_extracted / "101_ObjectCategories"
 
     elif dataset_name == "caltech-256":
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/hafnia_dataset.py
@@ -12,7 +12,6 @@ from packaging.version import Version
 
 from hafnia.dataset import dataset_helpers
 from hafnia.dataset.dataset_names import (
-    DATASET_FILENAMES_REQUIRED,
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
     FILENAME_DATASET_INFO,
@@ -38,6 +37,7 @@ from hafnia.dataset.operations import (
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.log import user_logger
 from hafnia.utils import progress_bar
+from hafnia_cli.config import Config
 
 
 @dataclass
@@ -434,7 +434,7 @@ class HafniaDataset:
         aws_credentials: AwsCredentials,
         force_redownload: bool = False,
     ) -> HafniaDataset:
-        from hafnia.platform.
+        from hafnia.platform.s5cmd_utils import fast_copy_files
 
         remote_src_paths = dataset.samples[SampleField.REMOTE_PATH].unique().to_list()
         update_rows = []
@@ -470,7 +470,7 @@ class HafniaDataset:
             return dataset
 
         environment_vars = aws_credentials.aws_credentials()
-
+        fast_copy_files(
            src_paths=remote_src_paths,
            dst_paths=local_dst_paths,
            append_envs=environment_vars,
@@ -563,7 +563,7 @@ class HafniaDataset:
             keep_sample_data=keep_sample_data,
         )
 
-    def write(self, path_folder: Path,
+    def write(self, path_folder: Path, drop_null_cols: bool = True) -> None:
         user_logger.info(f"Writing dataset to {path_folder}...")
         path_folder = path_folder.absolute()
         if not path_folder.exists():
@@ -578,18 +578,9 @@ class HafniaDataset:
             )
             new_paths.append(str(new_path))
         hafnia_dataset.samples = hafnia_dataset.samples.with_columns(pl.Series(new_paths).alias(SampleField.FILE_PATH))
-        hafnia_dataset.write_annotations(
-            path_folder=path_folder,
-            drop_null_cols=drop_null_cols,
-            add_version=add_version,
-        )
+        hafnia_dataset.write_annotations(path_folder=path_folder, drop_null_cols=drop_null_cols)
 
-    def write_annotations(
-        dataset: HafniaDataset,
-        path_folder: Path,
-        drop_null_cols: bool = True,
-        add_version: bool = False,
-    ) -> None:
+    def write_annotations(dataset: HafniaDataset, path_folder: Path, drop_null_cols: bool = True) -> None:
         """
         Writes only the annotations files (JSONL and Parquet) to the specified folder.
         """
@@ -604,18 +595,102 @@ class HafniaDataset:
         samples = samples.drop(pl.selectors.by_dtype(pl.Null))
 
         # Store only relative paths in the annotations files
-
-
-
-
+        if SampleField.FILE_PATH in samples.columns:  # We drop column for remote datasets
+            absolute_paths = samples[SampleField.FILE_PATH].to_list()
+            relative_paths = [str(Path(path).relative_to(path_folder)) for path in absolute_paths]
+            samples = samples.with_columns(pl.Series(relative_paths).alias(SampleField.FILE_PATH))
+        else:
+            samples = samples.with_columns(pl.lit("").alias(SampleField.FILE_PATH))
         samples.write_ndjson(path_folder / FILENAME_ANNOTATIONS_JSONL)  # Json for readability
         samples.write_parquet(path_folder / FILENAME_ANNOTATIONS_PARQUET)  # Parquet for speed
 
-
-
-
-
-
+    def delete_on_platform(dataset: HafniaDataset, interactive: bool = True) -> None:
+        """
+        Delete this dataset from the Hafnia platform.
+        This is a thin wrapper around `hafnia.platform.datasets.delete_dataset_completely_by_name`.
+
+        Args:
+            dataset (HafniaDataset): The :class:`HafniaDataset` instance to delete from the platform. The
+                dataset name is taken from `dataset.info.dataset_name`.
+            interactive (bool): If ``True``, perform the deletion in interactive mode (for example,
+                prompting the user for confirmation where supported). If ``False``,
+                run non-interactively, suitable for automated scripts or CI usage. Defaults to True.
+        """
+        from hafnia.platform.datasets import delete_dataset_completely_by_name
+
+        delete_dataset_completely_by_name(dataset_name=dataset.info.dataset_name, interactive=interactive)
+
+    def upload_to_platform(
+        dataset: HafniaDataset,
+        dataset_sample: Optional[HafniaDataset] = None,
+        allow_version_overwrite: bool = False,
+        interactive: bool = True,
+        gallery_images: Optional[Any] = None,
+        distribution_task_names: Optional[List[str]] = None,
+        cfg: Optional[Config] = None,
+    ) -> dict:
+        """
+        Upload the dataset and dataset details to the Hafnia platform.
+        This method ensures the dataset exists on the platform, synchronizes the
+        dataset files to remote storage, and uploads dataset details and optional gallery images
+        distributions.
+        Args:
+            dataset: The full :class:`HafniaDataset` instance that should be uploaded
+                to the platform.
+            dataset_sample: Optional sample :class:`HafniaDataset` used as a smaller
+                preview or subset of the main dataset on the platform. If provided,
+                it is uploaded alongside the full dataset for demonstration or
+                inspection purposes. Use only this if the sample dataset uses different
+                image files than the main dataset. Otherwise it is sufficient to just provide
+                the main dataset and the platform will create a sample automatically.
+            allow_version_overwrite: If ``True``, allows an existing dataset version
+                with the same name to be overwritten on the platform. If ``False``,
+                an error or confirmation may be required when a version conflict is
+                detected.
+            interactive: If ``True``, the upload process may prompt the user for
+                confirmation or additional input (for example when overwriting
+                existing versions). If ``False``, the upload is performed without
+                interactive prompts.
+            gallery_images: Optional collection of image identifiers or file names
+                that should be marked or displayed as gallery images for the dataset
+                on the platform. These are forwarded as ``gallery_image_names`` to
+                the platform API.
+            distribution_task_names: Optional list of task names associated with the
+                dataset that should be considered when configuring how the dataset is
+                distributed or exposed on the platform.
+            cfg: Optional :class:`hafnia_cli.config.Config` instance providing
+                configuration for platform access and storage. If not supplied, a
+                default configuration is created.
+        Returns:
+            dict: The response returned by the platform after uploading the dataset
+                details. The exact contents depend on the platform API but typically
+                include information about the created or updated dataset (such as
+                identifiers and status).
+        """
+
+        from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform
+        from hafnia.dataset.operations.dataset_s3_storage import sync_dataset_files_to_platform
+        from hafnia.platform.datasets import get_or_create_dataset
+
+        cfg = cfg or Config()
+        get_or_create_dataset(dataset.info.dataset_name, cfg=cfg)
+
+        sync_dataset_files_to_platform(
+            dataset=dataset,
+            sample_dataset=dataset_sample,
+            interactive=interactive,
+            allow_version_overwrite=allow_version_overwrite,
+            cfg=cfg,
+        )
+
+        response = upload_dataset_details_to_platform(
+            dataset=dataset,
+            distribution_task_names=distribution_task_names,
+            gallery_image_names=gallery_images,
+            cfg=cfg,
+        )
+
+        return response
 
     def __eq__(self, value) -> bool:
         if not isinstance(value, HafniaDataset):
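Together with the example script change above, the end-to-end flow suggested by this release is: build or load a `HafniaDataset`, optionally write it locally, then call `upload_to_platform`. A hedged sketch follows; the output folder is hypothetical and the dataset is assumed to be the `custom_dataset` built in `examples/example_hafnia_dataset.py`.

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset


def publish(custom_dataset: HafniaDataset) -> dict:
    # New write() signature: drop_null_cols defaults to True and add_version is gone.
    custom_dataset.write(Path(".data/custom-dataset"))  # hypothetical output folder

    # Ensures the dataset exists on the platform, syncs files to remote storage and
    # uploads the dataset details page; set interactive=False for CI usage.
    return custom_dataset.upload_to_platform(
        interactive=True,
        allow_version_overwrite=False,
    )


# To remove the dataset from the platform again (wraps delete_dataset_completely_by_name):
# custom_dataset.delete_on_platform(interactive=True)
```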
{hafnia-0.4.3 → hafnia-0.5.0}/src/hafnia/dataset/hafnia_dataset_types.py
@@ -51,7 +51,7 @@ class TaskInfo(BaseModel):
         return self.class_names.index(class_name)
 
     # The 'primitive'-field of type 'Type[Primitive]' is not supported by pydantic out-of-the-box as
-    # the 'Primitive' class is an abstract base class and for the actual
+    # the 'Primitive' class is an abstract base class and for the actual primitives such as Bbox, Bitmask, Classification.
     # Below magic functions ('ensure_primitive' and 'serialize_primitive') ensures that the 'primitive' field can
     # correctly validate and serialize sub-classes (Bbox, Classification, ...).
     @field_validator("primitive", mode="plain")
@@ -103,6 +103,8 @@ class TaskInfo(BaseModel):
 class DatasetInfo(BaseModel):
     dataset_name: str = Field(description="Name of the dataset, e.g. 'coco'")
     version: Optional[str] = Field(default=None, description="Version of the dataset")
+    dataset_title: Optional[str] = Field(default=None, description="Optional, human-readable title of the dataset")
+    description: Optional[str] = Field(default=None, description="Optional, description of the dataset")
     tasks: List[TaskInfo] = Field(default=None, description="List of tasks in the dataset")
     reference_bibtex: Optional[str] = Field(
         default=None,