hafnia 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +3 -1
- cli/config.py +43 -3
- cli/keychain.py +88 -0
- cli/profile_cmds.py +5 -2
- hafnia/__init__.py +1 -1
- hafnia/dataset/dataset_helpers.py +9 -2
- hafnia/dataset/dataset_names.py +2 -1
- hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
- hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
- hafnia/dataset/dataset_upload_helper.py +60 -4
- hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
- hafnia/dataset/hafnia_dataset.py +176 -50
- hafnia/dataset/operations/dataset_stats.py +2 -3
- hafnia/dataset/operations/dataset_transformations.py +19 -15
- hafnia/dataset/operations/table_transformations.py +4 -3
- hafnia/dataset/primitives/bbox.py +25 -12
- hafnia/dataset/primitives/bitmask.py +26 -14
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +16 -9
- hafnia/dataset/primitives/segmentation.py +10 -7
- hafnia/experiment/hafnia_logger.py +0 -9
- hafnia/platform/dataset_recipe.py +7 -2
- hafnia/platform/datasets.py +3 -3
- hafnia/platform/download.py +23 -18
- hafnia/utils.py +17 -0
- hafnia/visualizations/image_visualizations.py +1 -1
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/METADATA +8 -6
- hafnia-0.4.0.dist-info/RECORD +56 -0
- hafnia-0.3.0.dist-info/RECORD +0 -53
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -33,17 +33,17 @@ import json
 import re
 import textwrap
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, …
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union

 import cv2
 import more_itertools
 import numpy as np
 import polars as pl
 from PIL import Image
-from …
+from rich.progress import track

 from hafnia.dataset import dataset_helpers
-from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, FieldName
+from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, ColumnName, FieldName
 from hafnia.dataset.primitives import get_primitive_type_from_string
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.utils import remove_duplicates_preserve_order
@@ -73,7 +73,8 @@ def transform_images(
     path_image_folder = path_output / "data"
     path_image_folder.mkdir(parents=True, exist_ok=True)

-    …
+    org_paths = dataset.samples[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Transform images"):
         org_path = Path(org_path)
         if not org_path.exists():
             raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
@@ -86,7 +87,7 @@ def transform_images(
             raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
         new_paths.append(str(new_path))

-    table = dataset.samples.with_columns(pl.Series(new_paths).alias(…
+    table = dataset.samples.with_columns(pl.Series(new_paths).alias(ColumnName.FILE_PATH))
     return dataset.update_samples(table)

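The two hunks above rewrite transform_images to read source paths from `ColumnName.FILE_PATH` and to report progress with `rich.progress.track`. A minimal sketch of the polars pattern used to write the new paths back, assuming `ColumnName.FILE_PATH` resolves to the plain column name "file_path":

```python
# Overwrite an existing column with a new Series of equal length.
# "file_path" stands in for ColumnName.FILE_PATH (assumed to be that string).
import polars as pl

samples = pl.DataFrame({"file_path": ["a.jpg", "b.jpg"], "split": ["train", "test"]})
new_paths = ["data/a.jpg", "data/b.jpg"]

samples = samples.with_columns(pl.Series(new_paths).alias("file_path"))
print(samples)
```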
@@ -156,13 +157,16 @@ def get_task_info_from_task_name_and_primitive(

 def class_mapper(
     dataset: "HafniaDataset",
-    class_mapping: Dict[str, str],
+    class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
     method: str = "strict",
     primitive: Optional[Type[Primitive]] = None,
     task_name: Optional[str] = None,
 ) -> "HafniaDataset":
     from hafnia.dataset.hafnia_dataset import HafniaDataset

+    if isinstance(class_mapping, list):
+        class_mapping = dict(class_mapping)
+
     allowed_methods = ("strict", "remove_undefined", "keep_undefined")
     if method not in allowed_methods:
         raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
@@ -170,7 +174,7 @@ def class_mapper(
     task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=primitive)
     current_names = task.class_names or []

-    # Expand wildcard mappings
+    # Expand wildcard mappings e.g. {"Vehicle.*": "Vehicle"} to {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}
     class_mapping = expand_class_mapping(class_mapping, current_names)

     non_existing_mapping_names = set(class_mapping) - set(current_names)
@@ -213,7 +217,6 @@ def class_mapper(
     if OPS_REMOVE_CLASS in new_class_names:
         # Move __REMOVE__ to the end of the list if it exists
         new_class_names.append(new_class_names.pop(new_class_names.index(OPS_REMOVE_CLASS)))
-    name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}

     samples = dataset.samples
     samples_updated = samples.with_columns(
@@ -230,6 +233,7 @@ def class_mapper(
     )

     # Update class indices too
+    name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}
     samples_updated = samples_updated.with_columns(
         pl.col(task.primitive.column_name())
         .list.eval(
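Taken together, the class_mapper hunks (apparently from hafnia/dataset/operations/dataset_transformations.py, the +19/-15 entry in the summary above) let the mapping be passed as a dict or as a list of (old, new) pairs and expand wildcard keys against the task's class names. A hypothetical call sketch; the import path and class names are assumptions, and `dataset` stands for an existing HafniaDataset:

```python
# Hypothetical usage of the updated class_mapper; import path inferred from the
# file summary above, class names are placeholders.
from hafnia.dataset.operations.dataset_transformations import class_mapper


def remap_vehicles(dataset):  # `dataset` is an existing HafniaDataset instance
    return class_mapper(
        dataset,
        # A list of (old, new) pairs is now accepted and converted to a dict;
        # wildcard keys such as "Vehicle.*" are expanded against the task's class names.
        class_mapping=[("Vehicle.*", "Vehicle"), ("Pedestrian", "Person")],
        method="remove_undefined",  # alternatives per the diff: "strict", "keep_undefined"
    )
```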
@@ -354,14 +358,14 @@ def _validate_inputs_select_samples_by_class_name(
     name: Union[List[str], str],
     task_name: Optional[str] = None,
     primitive: Optional[Type[Primitive]] = None,
-) -> Tuple["TaskInfo", …
+) -> Tuple["TaskInfo", List[str]]:
     if isinstance(name, str):
         name = [name]
-    names = …
+    names = list(name)

     # Check that specified names are available in at least one of the tasks
     available_names_across_tasks = set(more_itertools.flatten([t.class_names for t in dataset.info.tasks]))
-    missing_class_names_across_tasks = names - available_names_across_tasks
+    missing_class_names_across_tasks = set(names) - available_names_across_tasks
     if len(missing_class_names_across_tasks) > 0:
         raise ValueError(
             f"The specified names {list(names)} have not been found in any of the tasks. "
@@ -370,15 +374,15 @@ def _validate_inputs_select_samples_by_class_name(

     # Auto infer task if task_name and primitive are not provided
     if task_name is None and primitive is None:
-        tasks_with_names = [t for t in dataset.info.tasks if names.issubset(t.class_names or [])]
+        tasks_with_names = [t for t in dataset.info.tasks if set(names).issubset(t.class_names or [])]
         if len(tasks_with_names) == 0:
             raise ValueError(
-                f"The specified names {…
+                f"The specified names {names} have not been found in any of the tasks. "
                 f"Available class names: {available_names_across_tasks}"
             )
         if len(tasks_with_names) > 1:
             raise ValueError(
-                f"Found multiple tasks containing the specified names {…
+                f"Found multiple tasks containing the specified names {names}. "
                 f"Specify either 'task_name' or 'primitive' to only select from one task. "
                 f"Tasks containing all provided names: {[t.name for t in tasks_with_names]}"
             )
@@ -393,7 +397,7 @@ def _validate_inputs_select_samples_by_class_name(
     )

     task_class_names = set(task.class_names or [])
-    missing_class_names = names - task_class_names
+    missing_class_names = set(names) - task_class_names
     if len(missing_class_names) > 0:
         raise ValueError(
             f"The specified names {list(missing_class_names)} have not been found for the '{task.name}' task. "
@@ -2,7 +2,7 @@ from pathlib import Path
 from typing import List, Optional, Type

 import polars as pl
-from …
+from rich.progress import track

 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
@@ -144,7 +144,7 @@ def split_primitive_columns_by_task_name(
     return samples_table


-def …
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -162,7 +162,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:

 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-    …
+    org_paths = table[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
         org_path = Path(org_path)
         if not org_path.exists():
             missing_files.append(org_path)
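Several loops in this release are wrapped in `rich.progress.track`, which takes any iterable plus a description and renders a progress bar while yielding the items unchanged. A stand-alone illustration of the pattern used in check_image_paths:

```python
from pathlib import Path

from rich.progress import track

paths = [Path("a.jpg"), Path("b.jpg"), Path("missing.jpg")]
# track() draws a progress bar while iterating over the paths.
missing = [p for p in track(paths, description="Check image paths") if not p.exists()]
print(f"{len(missing)} missing file(s)")
```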
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,18 +18,30 @@ from hafnia.dataset.primitives.utils import (

 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float
-    …
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
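The Bbox fields now carry pydantic Field descriptions; coordinates and sizes stay normalized to the image dimensions. A small construction sketch using only the fields shown above (the import path follows the bbox.py entry in the file summary):

```python
from hafnia.dataset.primitives.bbox import Bbox

bbox = Bbox(
    top_left_x=0.25,  # fraction of image width
    top_left_y=0.40,  # fraction of image height
    width=0.10,       # fraction of image width
    height=0.20,      # fraction of image height
    class_name="car",
    class_idx=0,
    confidence=0.95,
    ground_truth=False,  # e.g. a model prediction rather than an annotation
)
print(bbox)
```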
@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field

+from hafnia.dataset.dataset_names import FieldName
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
     anonymize_by_resizing,
@@ -14,23 +16,33 @@ from hafnia.dataset.primitives.utils import (
     text_org_from_left_bottom_to_centered,
 )

+FieldName
+

 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int
-    left: int
-    height: int
-    width: int
-    rleString: str
-    …
+    top: int = Field(description="Bitmask top coordinate in pixels ")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels is calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple

 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,14 +9,21 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name

 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence: Optional[float] = …
-    …
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )

     @staticmethod
     def default_task_name() -> str:
@@ -1,13 +1,17 @@
 from typing import Any, Tuple

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from hafnia.dataset.primitives.utils import clip


 class Point(BaseModel):
-    x: float
-    …
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )

     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
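Point gains Field metadata for its normalized x/y coordinates, and the to_pixel_coordinates signature shown in the context lines converts them to pixels for a given image shape. A sketch; the (height, width) order of image_shape and the exact return value are assumptions, not confirmed by this hunk:

```python
from hafnia.dataset.primitives.point import Point

point = Point(x=0.5, y=0.25)  # normalized coordinates
pixel = point.to_pixel_coordinates(image_shape=(480, 640))  # assumed (height, width)
print(pixel)
```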
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,21 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name

 class Polygon(Primitive):
     # Names should match names in FieldName
-    points: List[Point]
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence: Optional[float] = …
-    …
+    points: List[Point] = Field(description="List of points defining the polygon")
+    class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the polygon"
+    )

     @staticmethod
     def from_list_of_points(
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import get_class_name
@@ -9,15 +10,17 @@ from hafnia.visualizations.colors import get_n_colors


 class Segmentation(Primitive):
-    # …
-    class_names: Optional[List[str]] = None
-    ground_truth: bool = True
+    # WARNING: Segmentation masks have not been fully implemented yet
+    class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    …
+    task_name: str = Field(
+        default="",
+        description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the segmentation"
     )
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask

     @staticmethod
     def default_task_name() -> str:
@@ -12,8 +12,6 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from pydantic import BaseModel, field_validator

-from hafnia.data.factory import load_dataset
-from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
 from hafnia.utils import is_hafnia_cloud_job, now_as_str

@@ -136,13 +134,6 @@ class HafniaLogger:
         except Exception as e:
             user_logger.error(f"Failed to initialize MLflow: {e}")

-    def load_dataset(self, dataset_name: str) -> HafniaDataset:
-        """
-        Load a dataset from the specified path.
-        """
-        self.dataset_name = dataset_name
-        return load_dataset(dataset_name)
-
     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
         if is_hafnia_cloud_job():
@@ -11,12 +11,17 @@ from hafnia.utils import pretty_print_list_as_table, timed

 @timed("Get or create dataset recipe")
 def get_or_create_dataset_recipe(
-    recipe: dict, …
+    recipe: dict,
+    endpoint: str,
+    api_key: str,
+    name: Optional[str] = None,
+    overwrite: bool = False,
 ) -> Optional[Dict]:
     headers = {"Authorization": api_key}
-    data = {"template": {"body": recipe}}
+    data = {"template": {"body": recipe}, "overwrite": overwrite}
     if name is not None:
         data["name"] = name  # type: ignore[assignment]
+
     response = http.post(endpoint, headers=headers, data=data)
     return response

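get_or_create_dataset_recipe now takes endpoint, api_key, an optional name and an overwrite flag, and sends overwrite alongside the recipe template. A hypothetical call; the endpoint URL, API key and recipe body are placeholders:

```python
from hafnia.platform.dataset_recipe import get_or_create_dataset_recipe

response = get_or_create_dataset_recipe(
    recipe={"operations": []},  # placeholder recipe body
    endpoint="https://api.example.com/dataset-recipes",
    api_key="YOUR_API_KEY",
    name="my-recipe",
    overwrite=True,  # new flag in 0.4.0, forwarded in the request payload
)
print(response)
```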
hafnia/platform/datasets.py CHANGED
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional

 import rich
 from rich import print as rprint
-from …
+from rich.progress import track

 from cli.config import Config
 from hafnia import http, utils
@@ -122,7 +122,7 @@ def download_dataset_from_access_endpoint(
     try:
         fast_copy_files_s3(
             src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
-            dst_paths=dataset.samples[ColumnName.…
+            dst_paths=dataset.samples[ColumnName.FILE_PATH].to_list(),
             append_envs=envs,
             description="Downloading images",
         )
@@ -196,7 +196,7 @@ def execute_s5cmd_commands(

     error_lines = []
     lines = []
-    for line in …
+    for line in track(process.stdout, total=len(commands), description=description):
         if "ERROR" in line or "error" in line:
             error_lines.append(line.strip())
         lines.append(line.strip())
hafnia/platform/download.py CHANGED
@@ -1,10 +1,10 @@
 from pathlib import Path
-from typing import Dict
+from typing import Dict, Optional

 import boto3
 from botocore.exceptions import ClientError
 from pydantic import BaseModel, field_validator
-from …
+from rich.progress import Progress

 from hafnia.http import fetch
 from hafnia.log import sys_logger, user_logger
@@ -125,13 +125,15 @@ def download_single_object(s3_client, bucket: str, object_key: str, output_dir:
     return local_path


-def download_resource(resource_url: str, destination: str, api_key: str) -> Dict:
+def download_resource(resource_url: str, destination: str, api_key: str, prefix: Optional[str] = None) -> Dict:
     """
     Downloads either a single file from S3 or all objects under a prefix.

     Args:
         resource_url (str): The URL or identifier used to fetch S3 credentials.
         destination (str): Path to local directory where files will be stored.
+        api_key (str): API key for authentication when fetching credentials.
+        prefix (Optional[str]): If provided, only download objects under this prefix.

     Returns:
         Dict[str, Any]: A dictionary containing download info, e.g.:
@@ -147,7 +149,7 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
     res_credentials = get_resource_credentials(resource_url, api_key)

     bucket_name = res_credentials.bucket_name()
-    …
+    prefix = prefix or res_credentials.object_key()

     output_path = Path(destination)
     output_path.mkdir(parents=True, exist_ok=True)
@@ -159,29 +161,32 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
     )
     downloaded_files = []
     try:
-        s3_client.head_object(Bucket=bucket_name, Key=…
-        local_file = download_single_object(s3_client, bucket_name, …
+        s3_client.head_object(Bucket=bucket_name, Key=prefix)
+        local_file = download_single_object(s3_client, bucket_name, prefix, output_path)
         downloaded_files.append(str(local_file))
         user_logger.info(f"Downloaded single file: {local_file}")

     except ClientError as e:
         error_code = e.response.get("Error", {}).get("Code")
         if error_code == "404":
-            sys_logger.debug(f"Object '{…
-            response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=…
+            sys_logger.debug(f"Object '{prefix}' not found; trying as a prefix.")
+            response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
             contents = response.get("Contents", [])

             if not contents:
-                raise ValueError(f"No objects found for prefix '{…
-                …
+                raise ValueError(f"No objects found for prefix '{prefix}' in bucket '{bucket_name}'")
+
+            with Progress() as progress:
+                task = progress.add_task("Downloading files", total=len(contents))
+                for obj in contents:
+                    sub_key = obj["Key"]
+                    size_mb = obj.get("Size", 0) / 1024 / 1024
+                    progress.update(task, description=f"Downloading {sub_key} ({size_mb:.2f} MB)")
+                    local_file = download_single_object(s3_client, bucket_name, sub_key, output_path)
+                    downloaded_files.append(local_file.as_posix())
+                    progress.advance(task)
+
+            user_logger.info(f"Downloaded folder/prefix '{prefix}' with {len(downloaded_files)} object(s).")
         else:
             user_logger.error(f"Error checking object or prefix: {e}")
             raise RuntimeError(f"Failed to check or download S3 resource: {e}") from e
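download_resource now accepts an optional prefix; when it is omitted, the object key from the fetched credentials is used, and when the key does not resolve to a single object every object under the prefix is downloaded with a rich progress bar. A hypothetical call; the URL, key and prefix are placeholders:

```python
from hafnia.platform.download import download_resource

info = download_resource(
    resource_url="https://api.example.com/resources/1234",
    destination="./downloads",
    api_key="YOUR_API_KEY",
    prefix="datasets/my-dataset/",  # only objects under this prefix are downloaded
)
print(info)
```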
hafnia/utils.py CHANGED
@@ -63,6 +63,18 @@ def timed(label: str):
     return decorator


+def get_path_hafnia_cache() -> Path:
+    return Path.home() / "hafnia"
+
+
+def get_path_torchvision_downloads() -> Path:
+    return get_path_hafnia_cache() / "torchvision_downloads"
+
+
+def get_path_hafnia_conversions() -> Path:
+    return get_path_hafnia_cache() / "hafnia_conversions"
+
+
 def now_as_str() -> str:
     """Get the current date and time as a string."""
     return datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
@@ -205,3 +217,8 @@ def remove_duplicates_preserve_order(seq: Iterable) -> List:
     Remove duplicates from a list while preserving the order of elements.
     """
     return list(more_itertools.unique_everseen(seq))
+
+
+def is_image_file(file_path: Path) -> bool:
+    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif")
+    return file_path.suffix.lower() in image_extensions
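The new utils helpers are plain path and extension checks rooted at ~/hafnia. A short sketch of how they compose:

```python
from pathlib import Path

from hafnia.utils import get_path_torchvision_downloads, is_image_file

print(get_path_torchvision_downloads())  # <home>/hafnia/torchvision_downloads
print(is_image_file(Path("photo.JPG")))  # True, the extension check is case-insensitive
print(is_image_file(Path("notes.txt")))  # False
```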
@@ -193,7 +193,7 @@ def save_dataset_sample_set_visualizations(
         image = draw_annotations(image, annotations, draw_settings=draw_settings)

         pil_image = Image.fromarray(image)
-        path_image = path_output_folder / Path(sample.…
+        path_image = path_output_folder / Path(sample.file_path).name
         pil_image.save(path_image)
         paths.append(path_image)
