hafnia 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hafnia/dataset/dataset_details_uploader.py +41 -54
- hafnia/dataset/dataset_helpers.py +60 -16
- hafnia/dataset/dataset_names.py +1 -94
- hafnia/dataset/dataset_recipe/dataset_recipe.py +48 -4
- hafnia/dataset/format_conversions/torchvision_datasets.py +8 -5
- hafnia/dataset/hafnia_dataset.py +261 -92
- hafnia/dataset/hafnia_dataset_types.py +145 -19
- hafnia/dataset/operations/dataset_s3_storage.py +216 -0
- hafnia/dataset/operations/table_transformations.py +2 -19
- hafnia/http.py +2 -1
- hafnia/platform/datasets.py +144 -153
- hafnia/platform/download.py +1 -1
- hafnia/platform/s5cmd_utils.py +266 -0
- hafnia/utils.py +4 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/METADATA +3 -3
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/RECORD +22 -20
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/WHEEL +1 -1
- hafnia_cli/dataset_cmds.py +36 -12
- hafnia_cli/profile_cmds.py +0 -1
- hafnia_cli/runc_cmds.py +7 -2
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.4.3.dist-info → hafnia-0.5.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/hafnia_dataset_types.py CHANGED

@@ -1,5 +1,6 @@
 import collections
 import json
+from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Type, Union
@@ -7,12 +8,21 @@ from typing import Any, Dict, List, Optional, Type, Union
 import cv2
 import more_itertools
 import numpy as np
+import polars as pl
 from packaging.version import Version
 from PIL import Image
 from pydantic import BaseModel, Field, field_serializer, field_validator
 
 import hafnia
-from hafnia.dataset
+from hafnia.dataset import dataset_helpers
+from hafnia.dataset.dataset_helpers import version_from_string
+from hafnia.dataset.dataset_names import (
+    FILENAME_ANNOTATIONS_JSONL,
+    FILENAME_ANNOTATIONS_PARQUET,
+    FILENAME_DATASET_INFO,
+    SampleField,
+    StorageFormat,
+)
 from hafnia.dataset.primitives import (
     PRIMITIVE_TYPES,
     Bbox,
@@ -51,7 +61,7 @@ class TaskInfo(BaseModel):
         return self.class_names.index(class_name)
 
     # The 'primitive'-field of type 'Type[Primitive]' is not supported by pydantic out-of-the-box as
-    # the 'Primitive' class is an abstract base class and for the actual
+    # the 'Primitive' class is an abstract base class and for the actual primitives such as Bbox, Bitmask, Classification.
     # Below magic functions ('ensure_primitive' and 'serialize_primitive') ensures that the 'primitive' field can
     # correctly validate and serialize sub-classes (Bbox, Classification, ...).
     @field_validator("primitive", mode="plain")
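
The comment touched above describes why the `primitive: Type[Primitive]` field needs custom pydantic handling. A minimal, self-contained sketch of that pattern; the stand-in classes and registry below are assumptions for illustration, not Hafnia's actual `ensure_primitive`/`serialize_primitive` implementation:

```python
# Illustrative sketch only: registry and stand-in classes are assumptions, not Hafnia's code.
from typing import Type

from pydantic import BaseModel, field_serializer, field_validator


class Primitive:  # stand-in for the abstract base class
    pass


class Bbox(Primitive):  # stand-in for a concrete primitive
    pass


PRIMITIVE_REGISTRY = {"Bbox": Bbox}


class TaskInfoSketch(BaseModel):
    primitive: Type[Primitive]

    @field_validator("primitive", mode="plain")
    @classmethod
    def ensure_primitive(cls, value):
        # Accept either the class itself or its serialized name (e.g. loaded from JSON).
        if isinstance(value, str):
            return PRIMITIVE_REGISTRY[value]
        return value

    @field_serializer("primitive")
    def serialize_primitive(self, value: Type[Primitive]) -> str:
        return value.__name__


assert TaskInfoSketch(primitive=Bbox).model_dump()["primitive"] == "Bbox"
assert TaskInfoSketch(primitive="Bbox").primitive is Bbox
```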
@@ -102,7 +112,9 @@ class TaskInfo(BaseModel):
 
 class DatasetInfo(BaseModel):
     dataset_name: str = Field(description="Name of the dataset, e.g. 'coco'")
-    version:
+    version: str = Field(default="0.0.0", description="Version of the dataset")
+    dataset_title: Optional[str] = Field(default=None, description="Optional, human-readable title of the dataset")
+    description: Optional[str] = Field(default=None, description="Optional, description of the dataset")
     tasks: List[TaskInfo] = Field(default=None, description="List of tasks in the dataset")
     reference_bibtex: Optional[str] = Field(
         default=None,
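
A sketch of constructing `DatasetInfo` with the new optional `dataset_title` and `description` fields and the `version` default; the import path follows the file list above, the values are placeholders, and fields not shown are assumed to have defaults:

```python
# Placeholder values; assumes DatasetInfo is exposed from hafnia.dataset.hafnia_dataset_types
# (per the file list above) and that fields not shown here have defaults.
from hafnia.dataset.hafnia_dataset_types import DatasetInfo

info = DatasetInfo(
    dataset_name="coco",
    version="1.2.0",                                        # defaults to "0.0.0" when omitted
    dataset_title="COCO 2017",                              # new optional, human-readable title
    description="Common Objects in Context, 2017 split.",  # new optional description
)
```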
@@ -142,31 +154,21 @@ class DatasetInfo(BaseModel):
     @field_validator("format_version")
     @classmethod
     def _validate_format_version(cls, format_version: str) -> str:
-
-            Version(format_version)
-        except Exception as e:
-            raise ValueError(f"Invalid format_version '{format_version}'. Must be a valid version string.") from e
+        version_casted: Version = dataset_helpers.version_from_string(format_version, raise_error=True)
 
-        if
+        if version_casted > Version(hafnia.__dataset_format_version__):
             user_logger.warning(
                 f"The loaded dataset format version '{format_version}' is newer than the format version "
                 f"'{hafnia.__dataset_format_version__}' used in your version of Hafnia. Please consider "
                 f"updating Hafnia package."
             )
-        return
+        return str(version_casted)
 
     @field_validator("version")
     @classmethod
     def _validate_version(cls, dataset_version: Optional[str]) -> Optional[str]:
-
-
-
-        try:
-            Version(dataset_version)
-        except Exception as e:
-            raise ValueError(f"Invalid dataset_version '{dataset_version}'. Must be a valid version string.") from e
-
-        return dataset_version
+        version_casted: Version = dataset_helpers.version_from_string(dataset_version, raise_error=True)
+        return str(version_casted)
 
     def check_for_duplicate_task_names(self) -> List[TaskInfo]:
         return self._validate_check_for_duplicate_tasks(self.tasks)
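
Both validators now delegate to `dataset_helpers.version_from_string`, whose body is not part of this diff. A hypothetical sketch consistent with the call sites (returns a `packaging.version.Version`, and either raises or returns `None` on invalid input depending on `raise_error`):

```python
# Hypothetical sketch inferred from the call sites in this diff; not the actual
# hafnia.dataset.dataset_helpers implementation.
from typing import Optional

from packaging.version import InvalidVersion, Version


def version_from_string(version_str: str, raise_error: bool = True) -> Optional[Version]:
    """Parse a version string; raise ValueError or return None on invalid input."""
    try:
        return Version(version_str)
    except (InvalidVersion, TypeError) as e:
        if raise_error:
            raise ValueError(f"Invalid version '{version_str}'. Must be a valid version string.") from e
        return None
```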
@@ -236,7 +238,7 @@
         meta.update(info1.meta or {})
         return DatasetInfo(
             dataset_name=info0.dataset_name + "+" + info1.dataset_name,
-            version=
+            version="0.0.0",
             tasks=list(unique_tasks),
             meta=meta,
             format_version=dataset_format_version,
@@ -475,3 +477,127 @@ class Sample(BaseModel):
         annotations = self.get_annotations()
         annotations_visualized = image_visualizations.draw_annotations(image=image, primitives=annotations)
         return annotations_visualized
+
+
+@dataclass
+class DatasetMetadataFilePaths:
+    dataset_info: str  # Use 'str' to also support s3 paths
+    annotations_jsonl: Optional[str]
+    annotations_parquet: Optional[str]
+
+    def as_list(self) -> List[str]:
+        files = [self.dataset_info]
+        if self.annotations_jsonl is not None:
+            files.append(self.annotations_jsonl)
+        if self.annotations_parquet is not None:
+            files.append(self.annotations_parquet)
+        return files
+
+    def read_samples(self) -> pl.DataFrame:
+        if self.annotations_parquet is not None:
+            if not Path(self.annotations_parquet).exists():
+                raise FileNotFoundError(f"Parquet annotations file '{self.annotations_parquet}' does not exist.")
+            user_logger.info(f"Reading dataset annotations from Parquet file: {self.annotations_parquet}")
+            return pl.read_parquet(self.annotations_parquet)
+
+        if self.annotations_jsonl is not None:
+            if not Path(self.annotations_jsonl).exists():
+                raise FileNotFoundError(f"JSONL annotations file '{self.annotations_jsonl}' does not exist.")
+            user_logger.info(f"Reading dataset annotations from JSONL file: {self.annotations_jsonl}")
+            return pl.read_ndjson(self.annotations_jsonl)
+
+        raise ValueError(
+            "No annotations file available to read samples from. Dataset is missing both JSONL and Parquet files."
+        )
+
+    @staticmethod
+    def from_path(path_dataset: Path) -> "DatasetMetadataFilePaths":
+        path_dataset = path_dataset.absolute()
+        metadata_files = DatasetMetadataFilePaths(
+            dataset_info=str(path_dataset / FILENAME_DATASET_INFO),
+            annotations_jsonl=str(path_dataset / FILENAME_ANNOTATIONS_JSONL),
+            annotations_parquet=str(path_dataset / FILENAME_ANNOTATIONS_PARQUET),
+        )
+
+        return metadata_files
+
+    @staticmethod
+    def available_versions_from_files_list(files: list[str]) -> Dict[Version, "DatasetMetadataFilePaths"]:
+        versions_and_files: Dict[Version, Dict[str, str]] = collections.defaultdict(dict)
+        for metadata_file in files:
+            version_str, filename = metadata_file.split("/")[-2:]
+            versions_and_files[version_str][filename] = metadata_file
+
+        available_versions: Dict[Version, DatasetMetadataFilePaths] = {}
+        for version_str, version_files in versions_and_files.items():
+            version_casted: Version = dataset_helpers.version_from_string(version_str, raise_error=False)
+            if version_casted is None:
+                continue
+
+            if FILENAME_DATASET_INFO not in version_files:
+                continue
+            dataset_metadata_file = DatasetMetadataFilePaths(
+                dataset_info=version_files[FILENAME_DATASET_INFO],
+                annotations_jsonl=version_files.get(FILENAME_ANNOTATIONS_JSONL, None),
+                annotations_parquet=version_files.get(FILENAME_ANNOTATIONS_PARQUET, None),
+            )
+
+            available_versions[version_casted] = dataset_metadata_file
+
+        return available_versions
+
+    def check_version(self, version: str, raise_error: bool = True) -> bool:
+        """
+        Check if the dataset metadata files match the given version.
+        If raise_error is True, raises ValueError if the version does not match.
+        """
+        valid_version = version_from_string(version, raise_error=raise_error)
+        if valid_version is None:
+            return False
+
+        path_dataset_info = Path(self.dataset_info)
+        if not path_dataset_info.exists():
+            raise FileNotFoundError(f"Dataset info file missing '{self.dataset_info}' in dataset folder.")
+
+        dataset_info = json.loads(path_dataset_info.read_text())
+        dataset_version = dataset_info.get("version", None)
+        if dataset_version != version:
+            if raise_error:
+                raise ValueError(
+                    f"Dataset version mismatch. Expected version '{version}' but found "
+                    f"version '{dataset_version}' in dataset info."
+                )
+            return False
+
+        return True
+
+    def exists(self, version: Optional[str] = None, raise_error: bool = True) -> bool:
+        """
+        Check if all metadata files exist.
+        Add version to check if it matches the version in dataset info.
+        If raise_error is True, raises FileNotFoundError if any file is missing.
+        """
+        path_dataset_info = Path(self.dataset_info)
+        if not path_dataset_info.exists():
+            if raise_error:
+                raise FileNotFoundError(f"Dataset info file missing '{self.dataset_info}' in dataset folder.")
+            return False
+
+        if version is not None and self.check_version(version, raise_error=raise_error) is False:
+            return False
+
+        has_jsonl_file = self.annotations_jsonl is not None and Path(self.annotations_jsonl).exists()
+        if has_jsonl_file:
+            return True
+
+        has_parquet_file = self.annotations_parquet is not None and Path(self.annotations_parquet).exists()
+        if has_parquet_file:
+            return True
+
+        if raise_error:
+            raise FileNotFoundError(
+                f"Missing annotation file. Expected either '{FILENAME_ANNOTATIONS_JSONL}' or "
+                f"'{FILENAME_ANNOTATIONS_PARQUET}' in dataset folder."
+            )
+
+        return False
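
A hedged usage sketch of the new `DatasetMetadataFilePaths` helper; the local path, S3 keys, and the literal file name `dataset_info.json` are illustrative stand-ins for the real `FILENAME_*` constant values:

```python
# Illustrative only: local path, S3 keys and literal filenames are placeholders.
from pathlib import Path

from hafnia.dataset.hafnia_dataset_types import DatasetMetadataFilePaths

# Resolve metadata locations for a local dataset folder and load the samples table.
metadata = DatasetMetadataFilePaths.from_path(Path("./my_dataset"))
if metadata.exists(raise_error=False):
    samples = metadata.read_samples()  # prefers Parquet, falls back to JSONL

# Group versioned metadata keys (".../versions/<version>/<filename>") by parseable version.
keys = [
    "s3://my-bucket/sample/versions/1.0.0/dataset_info.json",
    "s3://my-bucket/sample/versions/1.1.0/dataset_info.json",
]
available = DatasetMetadataFilePaths.available_versions_from_files_list(keys)
latest = max(available)  # packaging.version.Version keys sort naturally
```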
hafnia/dataset/operations/dataset_s3_storage.py ADDED

@@ -0,0 +1,216 @@
+import tempfile
+import time
+from pathlib import Path
+from typing import Dict, Optional
+
+import polars as pl
+
+from hafnia.dataset.dataset_helpers import hash_file_xxhash
+from hafnia.dataset.dataset_names import (
+    DatasetVariant,
+    SampleField,
+)
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.log import user_logger
+from hafnia.platform import s5cmd_utils
+from hafnia.platform.datasets import get_upload_credentials
+from hafnia.platform.s5cmd_utils import ResourceCredentials
+from hafnia.utils import progress_bar
+from hafnia_cli.config import Config
+
+
+def delete_hafnia_dataset_files_on_platform(
+    dataset_name: str,
+    interactive: bool = True,
+    cfg: Optional[Config] = None,
+) -> bool:
+    cfg = cfg or Config()
+    resource_credentials = get_upload_credentials(dataset_name, cfg=cfg)
+
+    if resource_credentials is None:
+        raise RuntimeError("Failed to get upload credentials from the platform.")
+
+    return delete_hafnia_dataset_files_from_resource_credentials(
+        interactive=interactive,
+        resource_credentials=resource_credentials,
+    )
+
+
+def delete_hafnia_dataset_files_from_resource_credentials(
+    resource_credentials: ResourceCredentials,
+    interactive: bool = True,
+    remove_bucket: bool = True,
+) -> bool:
+    envs = resource_credentials.aws_credentials()
+    bucket_name = resource_credentials.bucket_name()
+    if interactive:
+        confirmation = (
+            input(
+                f"WARNING THIS WILL delete all files stored in 's3://{bucket_name}'.\n"
+                "Meaning that all previous versions of the dataset will be deleted. \n"
+                "Normally this is not needed, but if you have changed the dataset structure or want to start from fresh, "
+                "you can delete all files in the S3 bucket. "
+                "\nDo you really want to delete all files? (yes/NO): "
+            )
+            .strip()
+            .lower()
+        )
+        if confirmation != "yes":
+            user_logger.info("Delete operation cancelled by the user.")
+            return False
+    user_logger.info(f"Deleting all files in S3 bucket '{bucket_name}'...")
+    s5cmd_utils.delete_bucket_content(
+        bucket_prefix=f"s3://{bucket_name}",
+        remove_bucket=remove_bucket,
+        append_envs=envs,
+    )
+    return True
+
+
+def sync_hafnia_dataset_to_s3(
+    dataset: HafniaDataset,
+    bucket_prefix: str,
+    allow_version_overwrite: bool = False,
+    interactive: bool = True,
+    envs: Optional[Dict[str, str]] = None,
+) -> None:
+    t0 = time.time()
+    # bucket_prefix e.g. 's3://bucket-name/sample'
+    remote_paths = []
+    for file_str in progress_bar(dataset.samples[SampleField.FILE_PATH], description="Hashing data files"):
+        path_file = Path(file_str)
+        file_hash = hash_file_xxhash(path_file)
+
+        # Relative path in S3 bucket e.g. 'data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+        relative_path = s3_prefix_from_hash(hash=file_hash, suffix=path_file.suffix)
+
+        # Remote path in S3 bucket e.g. 's3://bucket-name/sample/data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png'
+        remote_path = f"{bucket_prefix}/{relative_path}"
+        remote_paths.append(remote_path)
+
+    dataset.samples = dataset.samples.with_columns(pl.Series(remote_paths).alias(SampleField.REMOTE_PATH))
+
+    user_logger.info(f"Syncing dataset to S3 bucket '{bucket_prefix}'")
+    files_in_s3 = set(s5cmd_utils.list_bucket(bucket_prefix=bucket_prefix, append_envs=envs))
+
+    # Discover data files (images, videos, etc.) missing in s3
+    data_files_missing = dataset.samples.filter(~pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+    files_already_in_s3 = dataset.samples.filter(pl.col(SampleField.REMOTE_PATH).is_in(files_in_s3))
+
+    with tempfile.TemporaryDirectory() as temp_dir:  # Temp folder to store metadata files
+        path_temp = Path(temp_dir)
+        # File paths are dropped when uploading to S3
+        dataset = dataset.update_samples(dataset.samples.drop(SampleField.FILE_PATH))
+        dataset.write_annotations(path_temp)
+
+        # Discover versioned metadata files (e.g. "annotations.jsonl", "dataset_info.json") missing in s3
+        metadata_files_local = []
+        metadata_files_s3 = []
+        for filename in path_temp.iterdir():
+            metadata_files_s3.append(f"{bucket_prefix}/versions/{dataset.info.version}/{filename.name}")
+            metadata_files_local.append(filename.as_posix())
+
+        overwrite_metadata_files = files_in_s3.intersection(set(metadata_files_s3))
+        will_overwrite_metadata_files = len(overwrite_metadata_files) > 0
+
+        n_files_already_in_s3 = len(files_already_in_s3)
+        user_logger.info(f"Sync dataset to {bucket_prefix}")
+        user_logger.info(
+            f"- Found that {n_files_already_in_s3} / {len(dataset.samples)} data files already exist. "
+            f"Meaning {len(data_files_missing)} data files will be uploaded. \n"
+            f"- Will upload {len(metadata_files_local)} metadata files. \n"
+            f"- Total files to upload: {len(data_files_missing) + len(metadata_files_local)}"
+        )
+        if will_overwrite_metadata_files:
+            msg = f"Metadata files for dataset version '{dataset.info.version}' already exist"
+            if allow_version_overwrite:
+                user_logger.warning(
+                    f"- WARNING: {msg}. Version will be overwritten as 'allow_version_overwrite=True' is set."
+                )
+            else:
+                raise ValueError(
+                    f"Upload cancelled. {msg}. \nTo overwrite existing metadata files, "
+                    "you will need to set 'allow_version_overwrite=True' explicitly."
+                )
+
+        has_missing_files = len(data_files_missing) > 0
+        if interactive and (has_missing_files or will_overwrite_metadata_files):
+            print("Please type 'yes' to upload files.")
+            confirmation = input("Do you want to continue? (yes/NO): ").strip().lower()
+
+            if confirmation != "yes":
+                raise RuntimeError("Upload cancelled by user.")
+
+        local_paths = metadata_files_local + data_files_missing[SampleField.FILE_PATH].to_list()
+        s3_paths = metadata_files_s3 + data_files_missing[SampleField.REMOTE_PATH].to_list()
+        s5cmd_utils.fast_copy_files(local_paths, s3_paths, append_envs=envs, description="Uploading files")
+    user_logger.info(f"- Synced dataset in {time.time() - t0:.2f} seconds.")
+
+
+def sync_dataset_files_to_platform(
+    dataset: HafniaDataset,
+    sample_dataset: Optional[HafniaDataset] = None,
+    interactive: bool = True,
+    allow_version_overwrite: bool = False,
+    cfg: Optional[Config] = None,
+) -> None:
+    cfg = cfg or Config()
+    resource_credentials = get_upload_credentials(dataset.info.dataset_name, cfg=cfg)
+
+    if resource_credentials is None:
+        raise RuntimeError("Failed to get upload credentials from the platform.")
+
+    sync_dataset_files_to_platform_from_resource_credentials(
+        dataset=dataset,
+        sample_dataset=sample_dataset,
+        interactive=interactive,
+        allow_version_overwrite=allow_version_overwrite,
+        resource_credentials=resource_credentials,
+    )
+
+
+def sync_dataset_files_to_platform_from_resource_credentials(
+    dataset: HafniaDataset,
+    sample_dataset: Optional[HafniaDataset],
+    interactive: bool,
+    allow_version_overwrite: bool,
+    resource_credentials: ResourceCredentials,
+):
+    envs = resource_credentials.aws_credentials()
+    bucket_name = resource_credentials.bucket_name()
+
+    for dataset_variant_type in [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]:
+        if dataset_variant_type == DatasetVariant.SAMPLE:
+            if sample_dataset is None:
+                dataset_variant = dataset.create_sample_dataset()
+            else:
+                dataset_variant = sample_dataset
+        else:
+            dataset_variant = dataset
+
+        sync_hafnia_dataset_to_s3(
+            dataset=dataset_variant,
+            bucket_prefix=f"s3://{bucket_name}/{dataset_variant_type.value}",
+            interactive=interactive,
+            allow_version_overwrite=allow_version_overwrite,
+            envs=envs,
+        )
+
+
+def s3_prefix_from_hash(hash: str, suffix: str) -> str:
+    """
+    Generate a relative S3 path from a hash value for objects stored in S3.
+
+    This function deliberately uses a hierarchical directory layout based on the
+    hash prefix to avoid putting too many objects in a single S3 prefix, which
+    can run into AWS S3 rate limits and performance issues. For example, for
+    hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4" and suffix ".png", the returned
+    path will be:
+
+        "data/df/e8/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"
+
+    Note: This intentionally differs from when images are stored to disk locally, where
+    a flat path of the form ``data/<hash><suffix>`` is used.
+    """
+    s3_prefix = f"data/{hash[:2]}/{hash[2:4]}/{hash}{suffix}"
+    return s3_prefix
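
The fan-out layout produced by `s3_prefix_from_hash` can be illustrated standalone; the hash value and bucket prefix below are arbitrary examples, and the function body is copied from the hunk above:

```python
# Standalone illustration; logic copied from s3_prefix_from_hash above, example values only.
def s3_prefix_from_hash(hash: str, suffix: str) -> str:
    return f"data/{hash[:2]}/{hash[2:4]}/{hash}{suffix}"


relative = s3_prefix_from_hash("e2b000ac47b19a999bee5456a6addb88", ".png")
print(relative)                             # data/e2/b0/e2b000ac47b19a999bee5456a6addb88.png
print(f"s3://my-bucket/sample/{relative}")  # full remote path, as built in sync_hafnia_dataset_to_s3
```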
hafnia/dataset/operations/table_transformations.py CHANGED

@@ -4,8 +4,6 @@ from typing import List, Optional, Tuple, Type
 import polars as pl
 
 from hafnia.dataset.dataset_names import (
-    FILENAME_ANNOTATIONS_JSONL,
-    FILENAME_ANNOTATIONS_PARQUET,
     PrimitiveField,
     SampleField,
 )
@@ -45,7 +43,8 @@ def create_primitive_table(
         remove_no_object_frames = remove_no_object_frames.drop(drop_columns_names)
         # Rename columns "height", "width" and "meta" for sample to avoid conflicts with object fields names
        remove_no_object_frames = remove_no_object_frames.rename(
-            {"height": "image.height", "width": "image.width", "meta": "image.meta"}
+            {"height": "image.height", "width": "image.width", "meta": "image.meta"},
+            strict=False,
         )
         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
     else:
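
`strict=False` makes the rename tolerant of tables that lack one of the mapped columns; in recent polars versions the default `strict=True` raises on unknown column names. A small, Hafnia-independent illustration:

```python
import polars as pl

df = pl.DataFrame({"height": [480], "width": [640]})  # note: no "meta" column

# With strict=False, the missing "meta" key is ignored instead of raising.
renamed = df.rename(
    {"height": "image.height", "width": "image.width", "meta": "image.meta"},
    strict=False,
)
print(renamed.columns)  # ['image.height', 'image.width']
```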
@@ -203,22 +202,6 @@ def split_primitive_columns_by_task_name(
     return samples_table
 
 
-def read_samples_from_path(path: Path) -> pl.DataFrame:
-    path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
-    if path_annotations.exists():
-        user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
-        return pl.read_parquet(path_annotations)
-
-    path_annotations_jsonl = path / FILENAME_ANNOTATIONS_JSONL
-    if path_annotations_jsonl.exists():
-        user_logger.info(f"Reading dataset annotations from JSONL file: {path_annotations_jsonl}")
-        return pl.read_ndjson(path_annotations_jsonl)
-
-    raise FileNotFoundError(
-        f"Unable to read annotations. No json file '{path_annotations.name}' or Parquet file '{{path_annotations.name}} in in '{path}'."
-    )
-
-
 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
     org_paths = table[SampleField.FILE_PATH].to_list()
hafnia/http.py CHANGED

@@ -24,7 +24,8 @@ def fetch(endpoint: str, headers: Dict, params: Optional[Dict] = None) -> Union[
     try:
         response = http.request("GET", endpoint, fields=params, headers=headers)
         if response.status != 200:
-
+            error_details = response.data.decode("utf-8")
+            raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")
 
         return json.loads(response.data.decode("utf-8"))
     finally:
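
The same error-reporting pattern in isolation, using urllib3 directly: include the response body in the raised error when the status is not 200. The endpoint and headers are placeholders:

```python
# Minimal standalone sketch of the pattern added above; the URL and headers are placeholders.
import json

import urllib3

http = urllib3.PoolManager()
response = http.request("GET", "https://example.com/api/v1/datasets", headers={"Accept": "application/json"})
if response.status != 200:
    error_details = response.data.decode("utf-8")
    raise urllib3.exceptions.HTTPError(f"Request failed with status {response.status}: {error_details}")
payload = json.loads(response.data.decode("utf-8"))
```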