hafnia-0.4.3-py3-none-any.whl → hafnia-0.5.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- hafnia/dataset/dataset_details_uploader.py +41 -54
- hafnia/dataset/dataset_helpers.py +60 -16
- hafnia/dataset/dataset_names.py +1 -94
- hafnia/dataset/dataset_recipe/dataset_recipe.py +48 -4
- hafnia/dataset/format_conversions/torchvision_datasets.py +8 -5
- hafnia/dataset/hafnia_dataset.py +261 -92
- hafnia/dataset/hafnia_dataset_types.py +145 -19
- hafnia/dataset/operations/dataset_s3_storage.py +216 -0
- hafnia/dataset/operations/table_transformations.py +2 -19
- hafnia/http.py +2 -1
- hafnia/platform/datasets.py +144 -153
- hafnia/platform/download.py +1 -1
- hafnia/platform/s5cmd_utils.py +266 -0
- hafnia/utils.py +4 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/METADATA +3 -3
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/RECORD +22 -20
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/WHEEL +1 -1
- hafnia_cli/dataset_cmds.py +36 -12
- hafnia_cli/profile_cmds.py +0 -1
- hafnia_cli/runc_cmds.py +7 -2
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/hafnia_dataset.py
CHANGED
@@ -10,15 +10,12 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union
 import polars as pl
 from packaging.version import Version

+from hafnia import utils
 from hafnia.dataset import dataset_helpers
+from hafnia.dataset.dataset_helpers import is_valid_version_string, version_from_string
 from hafnia.dataset.dataset_names import (
-    DATASET_FILENAMES_REQUIRED,
-    FILENAME_ANNOTATIONS_JSONL,
-    FILENAME_ANNOTATIONS_PARQUET,
-    FILENAME_DATASET_INFO,
     FILENAME_RECIPE_JSON,
     TAG_IS_SAMPLE,
-    AwsCredentials,
     PrimitiveField,
     SampleField,
     SplitName,
@@ -29,7 +26,7 @@ from hafnia.dataset.format_conversions import (
     format_image_classification_folder,
     format_yolo,
 )
-from hafnia.dataset.hafnia_dataset_types import DatasetInfo, Sample
+from hafnia.dataset.hafnia_dataset_types import DatasetInfo, DatasetMetadataFilePaths, Sample
 from hafnia.dataset.operations import (
     dataset_stats,
     dataset_transformations,
@@ -37,7 +34,11 @@ from hafnia.dataset.operations import (
 )
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.log import user_logger
+from hafnia.platform import s5cmd_utils
+from hafnia.platform.datasets import get_read_credentials_by_name
+from hafnia.platform.s5cmd_utils import AwsCredentials, ResourceCredentials
 from hafnia.utils import progress_bar
+from hafnia_cli.config import Config


 @dataclass
@@ -89,10 +90,11 @@ class HafniaDataset:
     @staticmethod
     def from_path(path_folder: Path, check_for_images: bool = True) -> "HafniaDataset":
         path_folder = Path(path_folder)
-
+        metadata_file_paths = DatasetMetadataFilePaths.from_path(path_folder)
+        metadata_file_paths.exists(raise_error=True)

-        dataset_info = DatasetInfo.from_json_file(
-        samples =
+        dataset_info = DatasetInfo.from_json_file(Path(metadata_file_paths.dataset_info))
+        samples = metadata_file_paths.read_samples()
         samples, dataset_info = _dataset_corrections(samples, dataset_info)

         # Convert from relative paths to absolute paths
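The inline file checks in `from_path` are replaced by `DatasetMetadataFilePaths`. A minimal sketch of loading a locally stored dataset with the new code path; the folder below is a hypothetical example:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset import HafniaDataset

# Hypothetical local folder containing dataset_info and annotation files.
path_dataset = Path.home() / ".hafnia" / "datasets" / "example-dataset"

# from_path() now resolves metadata via DatasetMetadataFilePaths and raises
# if required metadata files are missing.
dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
```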
@@ -103,14 +105,24 @@ class HafniaDataset:
         return HafniaDataset(samples=samples, info=dataset_info)

     @staticmethod
-    def from_name(
+    def from_name(
+        name: str,
+        version: Optional[str] = None,
+        force_redownload: bool = False,
+        download_files: bool = True,
+    ) -> "HafniaDataset":
         """
         Load a dataset by its name. The dataset must be registered in the Hafnia platform.
         """
-
-
+        if ":" in name:
+            name, version = dataset_helpers.dataset_name_and_version_from_string(name)
+            raise ValueError(
+                "The 'from_name' does not support the 'name:version' format. Please provide the version separately.\n"
+                f"E.g., HafniaDataset.from_name(name='{name}', version='{version}')"
+            )
         dataset_path = download_or_get_dataset_path(
             dataset_name=name,
+            version=version,
             force_redownload=force_redownload,
             download_files=download_files,
         )
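For orientation, a sketch of calling the widened `from_name` signature; the dataset name is a placeholder:

```python
from hafnia.dataset.hafnia_dataset import HafniaDataset

# Pinning an explicit version allows the local cache to be reused.
dataset = HafniaDataset.from_name("example-dataset", version="0.1.0")

# "latest" always re-resolves against the platform and re-downloads.
dataset_latest = HafniaDataset.from_name("example-dataset", version="latest")

# Passing "example-dataset:0.1.0" as a single string now raises a ValueError
# pointing to the separate `version` argument.
```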
@@ -434,7 +446,7 @@ class HafniaDataset:
         aws_credentials: AwsCredentials,
         force_redownload: bool = False,
     ) -> HafniaDataset:
-        from hafnia.platform.
+        from hafnia.platform.s5cmd_utils import fast_copy_files

         remote_src_paths = dataset.samples[SampleField.REMOTE_PATH].unique().to_list()
         update_rows = []
@@ -470,7 +482,7 @@ class HafniaDataset:
             return dataset

         environment_vars = aws_credentials.aws_credentials()
-
+        fast_copy_files(
             src_paths=remote_src_paths,
             dst_paths=local_dst_paths,
             append_envs=environment_vars,
@@ -523,30 +535,6 @@ class HafniaDataset:
         table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
         return table_transformations.has_primitive(table, PrimitiveType)

-    @staticmethod
-    def check_dataset_path(path_dataset: Path, raise_error: bool = True) -> bool:
-        """
-        Checks if the dataset path exists and contains the required files.
-        Returns True if the dataset is valid, otherwise raises an error or returns False.
-        """
-        if not path_dataset.exists():
-            if raise_error:
-                raise FileNotFoundError(f"Dataset path {path_dataset} does not exist.")
-            return False
-
-        required_files = [
-            FILENAME_DATASET_INFO,
-            FILENAME_ANNOTATIONS_JSONL,
-            FILENAME_ANNOTATIONS_PARQUET,
-        ]
-        for filename in required_files:
-            if not (path_dataset / filename).exists():
-                if raise_error:
-                    raise FileNotFoundError(f"Required file {filename} not found in {path_dataset}.")
-                return False
-
-        return True
-
     def copy(self) -> "HafniaDataset":
         return HafniaDataset(info=self.info.model_copy(deep=True), samples=self.samples.clone())

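With `check_dataset_path` removed, the equivalent check goes through `DatasetMetadataFilePaths`, as `from_path` and `get_or_create_dataset_path_from_recipe` now do. A rough replacement for the old call, assuming `exists()` keeps the `raise_error` switch shown elsewhere in this diff:

```python
from pathlib import Path

from hafnia.dataset.hafnia_dataset_types import DatasetMetadataFilePaths

path_dataset = Path("/data/example-dataset")  # hypothetical path

# Old: HafniaDataset.check_dataset_path(path_dataset, raise_error=False)
metadata_files = DatasetMetadataFilePaths.from_path(path_dataset)
is_valid = metadata_files.exists(raise_error=False)
```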
@@ -563,7 +551,7 @@ class HafniaDataset:
             keep_sample_data=keep_sample_data,
         )

-    def write(self, path_folder: Path,
+    def write(self, path_folder: Path, drop_null_cols: bool = True) -> None:
         user_logger.info(f"Writing dataset to {path_folder}...")
         path_folder = path_folder.absolute()
         if not path_folder.exists():
@@ -578,44 +566,124 @@ class HafniaDataset:
             )
             new_paths.append(str(new_path))
         hafnia_dataset.samples = hafnia_dataset.samples.with_columns(pl.Series(new_paths).alias(SampleField.FILE_PATH))
-        hafnia_dataset.write_annotations(
-            path_folder=path_folder,
-            drop_null_cols=drop_null_cols,
-            add_version=add_version,
-        )
+        hafnia_dataset.write_annotations(path_folder=path_folder, drop_null_cols=drop_null_cols)

-    def write_annotations(
-        dataset: HafniaDataset,
-        path_folder: Path,
-        drop_null_cols: bool = True,
-        add_version: bool = False,
-    ) -> None:
+    def write_annotations(dataset: HafniaDataset, path_folder: Path, drop_null_cols: bool = True) -> None:
         """
         Writes only the annotations files (JSONL and Parquet) to the specified folder.
         """
+
         user_logger.info(f"Writing dataset annotations to {path_folder}...")
-
-
-
-        dataset.info.write_json(
+        metadata_file_paths = DatasetMetadataFilePaths.from_path(path_folder)
+        path_dataset_info = Path(metadata_file_paths.dataset_info)
+        path_dataset_info.parent.mkdir(parents=True, exist_ok=True)
+        dataset.info.write_json(path_dataset_info)

         samples = dataset.samples
         if drop_null_cols:  # Drops all unused/Null columns
             samples = samples.drop(pl.selectors.by_dtype(pl.Null))

+        path_folder = path_folder.absolute()
         # Store only relative paths in the annotations files
-
-
-
+        if SampleField.FILE_PATH in samples.columns:  # We drop column for remote datasets
+            absolute_paths = samples[SampleField.FILE_PATH].to_list()
+            relative_paths = [str(Path(path).relative_to(path_folder)) for path in absolute_paths]
+            samples = samples.with_columns(pl.Series(relative_paths).alias(SampleField.FILE_PATH))
+        else:
+            samples = samples.with_columns(pl.lit("").alias(SampleField.FILE_PATH))
+
+        if metadata_file_paths.annotations_jsonl:
+            samples.write_ndjson(Path(metadata_file_paths.annotations_jsonl))  # Json for readability
+        if metadata_file_paths.annotations_parquet:
+            samples.write_parquet(Path(metadata_file_paths.annotations_parquet))  # Parquet for speed
+
+    def delete_on_platform(dataset: HafniaDataset, interactive: bool = True) -> None:
+        """
+        Delete this dataset from the Hafnia platform.
+        This is a thin wrapper around `hafnia.platform.datasets.delete_dataset_completely_by_name`.
+
+        Args:
+            dataset (HafniaDataset): The :class:`HafniaDataset` instance to delete from the platform. The
+                dataset name is taken from `dataset.info.dataset_name`.
+            interactive (bool): If ``True``, perform the deletion in interactive mode (for example,
+                prompting the user for confirmation where supported). If ``False``,
+                run non-interactively, suitable for automated scripts or CI usage. Defaults to True.
+        """
+        from hafnia.platform.datasets import delete_dataset_completely_by_name
+
+        delete_dataset_completely_by_name(dataset_name=dataset.info.dataset_name, interactive=interactive)
+
+    def upload_to_platform(
+        dataset: HafniaDataset,
+        dataset_sample: Optional[HafniaDataset] = None,
+        allow_version_overwrite: bool = False,
+        interactive: bool = True,
+        gallery_images: Optional[Any] = None,
+        distribution_task_names: Optional[List[str]] = None,
+        cfg: Optional[Config] = None,
+    ) -> dict:
+        """
+        Upload the dataset and dataset details to the Hafnia platform.
+        This method ensures the dataset exists on the platform, synchronizes the
+        dataset files to remote storage, and uploads dataset details and optional gallery images
+        distributions.
+        Args:
+            dataset: The full :class:`HafniaDataset` instance that should be uploaded
+                to the platform.
+            dataset_sample: Optional sample :class:`HafniaDataset` used as a smaller
+                preview or subset of the main dataset on the platform. If provided,
+                it is uploaded alongside the full dataset for demonstration or
+                inspection purposes. Use only this if the sample dataset uses different
+                image files than the main dataset. Otherwise it is sufficient to just provide
+                the main dataset and the platform will create a sample automatically.
+            allow_version_overwrite: If ``True``, allows an existing dataset version
+                with the same name to be overwritten on the platform. If ``False``,
+                an error or confirmation may be required when a version conflict is
+                detected.
+            interactive: If ``True``, the upload process may prompt the user for
+                confirmation or additional input (for example when overwriting
+                existing versions). If ``False``, the upload is performed without
+                interactive prompts.
+            gallery_images: Optional collection of image identifiers or file names
+                that should be marked or displayed as gallery images for the dataset
+                on the platform. These are forwarded as ``gallery_image_names`` to
+                the platform API.
+            distribution_task_names: Optional list of task names associated with the
+                dataset that should be considered when configuring how the dataset is
+                distributed or exposed on the platform.
+            cfg: Optional :class:`hafnia_cli.config.Config` instance providing
+                configuration for platform access and storage. If not supplied, a
+                default configuration is created.
+        Returns:
+            dict: The response returned by the platform after uploading the dataset
+                details. The exact contents depend on the platform API but typically
+                include information about the created or updated dataset (such as
+                identifiers and status).
+        """
+
+        from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform
+        from hafnia.dataset.operations.dataset_s3_storage import sync_dataset_files_to_platform
+        from hafnia.platform.datasets import get_or_create_dataset
+
+        cfg = cfg or Config()
+        get_or_create_dataset(dataset.info.dataset_name, cfg=cfg)
+
+        sync_dataset_files_to_platform(
+            dataset=dataset,
+            sample_dataset=dataset_sample,
+            interactive=interactive,
+            allow_version_overwrite=allow_version_overwrite,
+            cfg=cfg,
+        )

-
-
+        response = upload_dataset_details_to_platform(
+            dataset=dataset,
+            distribution_task_names=distribution_task_names,
+            gallery_image_names=gallery_images,
+            cfg=cfg,
+        )

-
-        path_version = path_folder / "versions" / f"{dataset.info.version}"
-        path_version.mkdir(parents=True, exist_ok=True)
-        for filename in DATASET_FILENAMES_REQUIRED:
-            shutil.copy2(path_folder / filename, path_version / filename)
+        return response

     def __eq__(self, value) -> bool:
         if not isinstance(value, HafniaDataset):
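A sketch of the platform round-trip these additions enable. `upload_to_platform` and `delete_on_platform` are defined inside `class HafniaDataset` with the dataset as first parameter, so they should be callable as instance methods; the dataset and task names below are placeholders:

```python
from hafnia.dataset.hafnia_dataset import HafniaDataset

dataset = HafniaDataset.from_name("example-dataset", version="latest")

# Register (or reuse) the dataset on the platform, sync files, upload details.
response = dataset.upload_to_platform(
    allow_version_overwrite=False,
    interactive=False,  # no confirmation prompts, e.g. for CI
    distribution_task_names=["object-detection"],  # hypothetical task name
)
print(response)

# Remove the dataset from the platform again; prompts for confirmation by default.
dataset.delete_on_platform(interactive=True)
```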
@@ -632,6 +700,42 @@ class HafniaDataset:
         return True


+def _dataset_corrections(samples: pl.DataFrame, dataset_info: DatasetInfo) -> Tuple[pl.DataFrame, DatasetInfo]:
+    format_version_of_dataset = Version(dataset_info.format_version)
+
+    ## Backwards compatibility fixes for older dataset versions
+    if format_version_of_dataset < Version("0.2.0"):
+        samples = table_transformations.add_dataset_name_if_missing(samples, dataset_info.dataset_name)
+
+    if "file_name" in samples.columns:
+        samples = samples.rename({"file_name": SampleField.FILE_PATH})
+
+    if SampleField.SAMPLE_INDEX not in samples.columns:
+        samples = table_transformations.add_sample_index(samples)
+
+    # Backwards compatibility: If tags-column doesn't exist, create it with empty lists
+    if SampleField.TAGS not in samples.columns:
+        tags_column: List[List[str]] = [[] for _ in range(len(samples))]  # type: ignore[annotation-unchecked]
+        samples = samples.with_columns(pl.Series(tags_column, dtype=pl.List(pl.String)).alias(SampleField.TAGS))
+
+    if SampleField.STORAGE_FORMAT not in samples.columns:
+        samples = samples.with_columns(pl.lit(StorageFormat.IMAGE).alias(SampleField.STORAGE_FORMAT))
+
+    if SampleField.SAMPLE_INDEX in samples.columns and samples[SampleField.SAMPLE_INDEX].dtype != pl.UInt64:
+        samples = samples.cast({SampleField.SAMPLE_INDEX: pl.UInt64})
+
+    if format_version_of_dataset <= Version("0.2.0"):
+        if SampleField.BITMASKS in samples.columns and samples[SampleField.BITMASKS].dtype == pl.List(pl.Struct):
+            struct_schema = samples.schema[SampleField.BITMASKS].inner
+            struct_names = [f.name for f in struct_schema.fields]
+            if "rleString" in struct_names:
+                struct_names[struct_names.index("rleString")] = "rle_string"
+                samples = samples.with_columns(
+                    pl.col(SampleField.BITMASKS).list.eval(pl.element().struct.rename_fields(struct_names))
+                )
+    return samples, dataset_info
+
+
 def check_hafnia_dataset_from_path(path_dataset: Path) -> None:
     dataset = HafniaDataset.from_path(path_dataset, check_for_images=True)
     dataset.check_dataset()
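The `rleString` → `rle_string` fix uses polars' struct-field renaming inside a list column. A standalone illustration of the pattern on toy data (not the real bitmask schema):

```python
import polars as pl

# Toy list-of-struct column using the old camelCase field name.
df = pl.DataFrame({"bitmasks": [[{"rleString": "abc", "height": 2}]]})

struct_schema = df.schema["bitmasks"].inner
new_names = [f.name for f in struct_schema.fields]
new_names[new_names.index("rleString")] = "rle_string"

df = df.with_columns(
    pl.col("bitmasks").list.eval(pl.element().struct.rename_fields(new_names))
)
print(df.schema)  # "bitmasks" now holds structs with field "rle_string"
```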
@@ -653,7 +757,8 @@ def get_or_create_dataset_path_from_recipe(
     if force_redownload:
         shutil.rmtree(path_dataset, ignore_errors=True)

-
+    dataset_metadata_files = DatasetMetadataFilePaths.from_path(path_dataset)
+    if dataset_metadata_files.exists(raise_error=False):
         return path_dataset

     path_dataset.mkdir(parents=True, exist_ok=True)
@@ -666,37 +771,101 @@ def get_or_create_dataset_path_from_recipe(
     return path_dataset


-def
-
+def available_dataset_versions_from_name(dataset_name: str) -> Dict[Version, "DatasetMetadataFilePaths"]:
+    credentials: ResourceCredentials = get_read_credentials_by_name(dataset_name=dataset_name)
+    return available_dataset_versions(credentials=credentials)

-    ## Backwards compatibility fixes for older dataset versions
-    if format_version_of_dataset < Version("0.2.0"):
-        samples = table_transformations.add_dataset_name_if_missing(samples, dataset_info.dataset_name)

-
-
+def available_dataset_versions(
+    credentials: ResourceCredentials,
+) -> Dict[Version, "DatasetMetadataFilePaths"]:
+    envs = credentials.aws_credentials()
+    bucket_prefix_sample_versions = f"{credentials.s3_uri()}/versions"
+    all_s3_annotation_files = s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix_sample_versions, append_envs=envs)
+    available_versions = DatasetMetadataFilePaths.available_versions_from_files_list(all_s3_annotation_files)
+    return available_versions

-    if SampleField.SAMPLE_INDEX not in samples.columns:
-        samples = table_transformations.add_sample_index(samples)

-
-
-
-
+def select_version_from_available_versions(
+    available_versions: Dict[Version, "DatasetMetadataFilePaths"],
+    version: Optional[str],
+) -> "DatasetMetadataFilePaths":
+    if len(available_versions) == 0:
+        raise ValueError("No versions were found in the dataset.")

-
-
+    if version is None:
+        str_versions = [str(v) for v in available_versions]
+        raise ValueError(f"Version must be specified. Available versions: {str_versions}")
+    elif version == "latest":
+        version_casted = max(available_versions)
+        user_logger.info(f"'latest' version '{version_casted}' has been selected")
+    else:
+        version_casted = version_from_string(version)

-
-
+    if version_casted not in available_versions:
+        raise ValueError(f"Selected version '{version}' not found in available versions: {available_versions}")

-
-
-
-
-
-
-
-
-
-
+    return available_versions[version_casted]
+
+
+def download_meta_dataset_files_from_version(
+    resource_credentials: ResourceCredentials, version: Optional[str], path_dataset: Path
+) -> list[str]:
+    envs = resource_credentials.aws_credentials()
+    available_versions = available_dataset_versions(credentials=resource_credentials)
+    metadata_files = select_version_from_available_versions(available_versions=available_versions, version=version)
+
+    s3_files = metadata_files.as_list()
+    path_dataset.mkdir(parents=True, exist_ok=True)
+    local_paths = [(path_dataset / filename.split("/")[-1]).as_posix() for filename in s3_files]
+    s5cmd_utils.fast_copy_files(
+        src_paths=s3_files,
+        dst_paths=local_paths,
+        append_envs=envs,
+        description="Downloading meta dataset files",
+    )
+
+    return local_paths
+
+
+def download_or_get_dataset_path(
+    dataset_name: str,
+    version: Optional[str],
+    cfg: Optional[Config] = None,
+    path_datasets_folder: Optional[str] = None,
+    force_redownload: bool = False,
+    download_files: bool = True,
+) -> Path:
+    """Download or get the path of the dataset."""
+
+    path_datasets = path_datasets_folder or utils.PATH_DATASETS
+    path_dataset = Path(path_datasets) / dataset_name
+    if not is_valid_version_string(version, allow_none=True, allow_latest=True):
+        raise ValueError(
+            f"Invalid version string: {version}. Should be a valid version (e.g. '0.1.0'), 'latest' or None."
+        )
+
+    # Only valid versions (e.g. '0.1.0', '1.0.0') can use local cache. Using either "latest"/None will always redownload
+    if is_valid_version_string(version, allow_none=False, allow_latest=False):
+        dataset_metadata_files = DatasetMetadataFilePaths.from_path(path_dataset)
+        dataset_exists = dataset_metadata_files.exists(version=version, raise_error=False)
+        if dataset_exists and not force_redownload:
+            user_logger.info("Dataset found locally. Set 'force=True' or add `--force` flag with cli to re-download")
+            return path_dataset
+
+    cfg = cfg or Config()
+    resource_credentials = get_read_credentials_by_name(dataset_name=dataset_name, cfg=cfg)
+    if resource_credentials is None:
+        raise ValueError(f"Failed to get read credentials for dataset '{dataset_name}' from the platform.")
+
+    download_meta_dataset_files_from_version(
+        resource_credentials=resource_credentials, version=version, path_dataset=path_dataset
+    )
+
+    if not download_files:
+        return path_dataset
+
+    dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
+    dataset = dataset.download_files_aws(path_dataset, aws_credentials=resource_credentials, force_redownload=True)
+    dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded
+    return path_dataset