hafnia 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +3 -1
- cli/config.py +43 -3
- cli/keychain.py +88 -0
- cli/profile_cmds.py +5 -2
- hafnia/__init__.py +1 -1
- hafnia/dataset/dataset_helpers.py +9 -2
- hafnia/dataset/dataset_names.py +130 -16
- hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
- hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
- hafnia/dataset/dataset_upload_helper.py +83 -22
- hafnia/dataset/format_conversions/format_image_classification_folder.py +110 -0
- hafnia/dataset/format_conversions/format_yolo.py +164 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +287 -0
- hafnia/dataset/hafnia_dataset.py +396 -96
- hafnia/dataset/operations/dataset_stats.py +84 -73
- hafnia/dataset/operations/dataset_transformations.py +116 -47
- hafnia/dataset/operations/table_transformations.py +135 -17
- hafnia/dataset/primitives/bbox.py +25 -14
- hafnia/dataset/primitives/bitmask.py +22 -15
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +15 -10
- hafnia/dataset/primitives/primitive.py +1 -1
- hafnia/dataset/primitives/segmentation.py +12 -9
- hafnia/experiment/hafnia_logger.py +0 -9
- hafnia/platform/dataset_recipe.py +7 -2
- hafnia/platform/datasets.py +5 -9
- hafnia/platform/download.py +24 -90
- hafnia/torch_helpers.py +12 -12
- hafnia/utils.py +17 -0
- hafnia/visualizations/image_visualizations.py +3 -1
- {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/METADATA +11 -9
- hafnia-0.4.1.dist-info/RECORD +57 -0
- hafnia-0.3.0.dist-info/RECORD +0 -53
- {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/WHEEL +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/entry_points.txt +0 -0
- {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,14 @@
 from pathlib import Path
-from typing import List, Optional, Type
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type

 import polars as pl
-from
+from rich.progress import track

 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
-
-
+    PrimitiveField,
+    SampleField,
 )
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import PRIMITIVE_TYPES
@@ -16,9 +16,15 @@ from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.log import user_logger

+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import TaskInfo
+

 def create_primitive_table(
-    samples_table: pl.DataFrame,
+    samples_table: pl.DataFrame,
+    PrimitiveType: Type[Primitive],
+    keep_sample_data: bool = False,
+    task_name: Optional[str] = None,
 ) -> Optional[pl.DataFrame]:
     """
     Returns a DataFrame with objects of the specified primitive type.
@@ -48,6 +54,9 @@ def create_primitive_table(
         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
     else:
         objects_df = remove_no_object_frames.select(pl.col(column_name).explode().struct.unnest())
+
+    if task_name is not None:
+        objects_df = objects_df.filter(pl.col(PrimitiveField.TASK_NAME) == task_name)
     return objects_df

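The new `task_name` argument narrows the primitive table to a single task after the list[struct] column is exploded and unnested. A minimal sketch of that pattern with polars (not package code; the column, field, and task names below are invented):

```python
# Illustrative only: mimics the explode/unnest + task_name filter shown above.
import polars as pl

samples = pl.DataFrame(
    {
        "file_name": ["a.jpg", "b.jpg"],
        "classifications": [
            [{"class_name": "car", "task_name": "vehicle_type"}],
            [
                {"class_name": "night", "task_name": "time_of_day"},
                {"class_name": "truck", "task_name": "vehicle_type"},
            ],
        ],
    }
)

# One row per object, then keep only the objects belonging to one task.
objects = samples.select(pl.col("classifications").explode().struct.unnest())
vehicle_objects = objects.filter(pl.col("task_name") == "vehicle_type")
print(vehicle_objects)
```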
@@ -55,11 +64,12 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFram
     has_same_schema = samples0.schema == samples1.schema
     if not has_same_schema:
         shared_columns = []
-        for column_name,
+        for column_name, s0_column_type in samples0.schema.items():
             if column_name not in samples1.schema:
                 continue
+            samples0, samples1 = correction_of_list_struct_primitives(samples0, samples1, column_name)

-            if
+            if samples0.schema[column_name] != samples1.schema[column_name]:
                 continue
             shared_columns.append(column_name)

@@ -79,16 +89,58 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFram
     samples0 = samples0.select(list(shared_columns))
     samples1 = samples1.select(list(shared_columns))
     merged_samples = pl.concat([samples0, samples1], how="vertical")
-    merged_samples = merged_samples
+    merged_samples = add_sample_index(merged_samples)
     return merged_samples


+def correction_of_list_struct_primitives(
+    samples0: pl.DataFrame,
+    samples1: pl.DataFrame,
+    column_name: str,
+) -> Tuple[pl.DataFrame, pl.DataFrame]:
+    """
+    Corrects primitive columns (bboxes, polygons etc of type 'list[struct]') by removing non-matching struct fields
+    between two datasets. This is useful when merging two datasets with the same primitive (e.g. Bbox), where
+    some (less important) field types in the struct differ between the two datasets.
+    This issue often occurs with the 'meta' field as different dataset formats may store different metadata information.
+    """
+    s0_column_type = samples0.schema[column_name]
+    s1_column_type = samples1.schema[column_name]
+    is_list_structs = s1_column_type == pl.List(pl.Struct) and s0_column_type == pl.List(pl.Struct)
+    is_non_matching_types = s1_column_type != s0_column_type
+    if is_list_structs and is_non_matching_types:  # Only perform correction for list[struct] types that do not match
+        s0_fields = set(s0_column_type.inner.fields)
+        s1_fields = set(s1_column_type.inner.fields)
+        similar_fields = s0_fields.intersection(s1_fields)
+        s0_dropped_fields = s0_fields - similar_fields
+        if len(s0_dropped_fields) > 0:
+            samples0 = samples0.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        s1_dropped_fields = s1_fields - similar_fields
+        if len(s1_dropped_fields) > 0:
+            samples1 = samples1.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        user_logger.warning(
+            f"Primitive column '{column_name}' has none-matching fields in the two datasets. "
+            f"Dropping fields in samples0: {[f.name for f in s0_dropped_fields]}. "
+            f"Dropping fields in samples1: {[f.name for f in s1_dropped_fields]}."
+        )
+
+    return samples0, samples1
+
+
 def filter_table_for_class_names(
     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
 ) -> Optional[pl.DataFrame]:
     table_with_selected_class_names = samples_table.filter(
         pl.col(PrimitiveType.column_name())
-        .list.eval(pl.element().struct.field(
+        .list.eval(pl.element().struct.field(PrimitiveField.CLASS_NAME).is_in(class_names))
         .list.any()
     )

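The new `correction_of_list_struct_primitives` aligns two list[struct] primitive columns by keeping only the struct fields they share before concatenation. A self-contained sketch of the same idea (not package code; the column and field names are invented):

```python
# Illustrative only: drop non-shared struct fields so pl.concat can stack the frames.
import polars as pl

a = pl.DataFrame({"bboxes": [[{"class_name": "car", "meta_a": 1}]]})
b = pl.DataFrame({"bboxes": [[{"class_name": "bus", "meta_b": "x"}]]})

fields_a = {f.name for f in a.schema["bboxes"].inner.fields}
fields_b = {f.name for f in b.schema["bboxes"].inner.fields}
shared = sorted(fields_a & fields_b)  # only 'class_name' survives

keep_shared = pl.col("bboxes").list.eval(
    pl.struct([pl.element().struct.field(name) for name in shared])
)
merged = pl.concat(
    [a.with_columns(keep_shared), b.with_columns(keep_shared)], how="vertical"
)
print(merged)
```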
@@ -100,20 +152,20 @@ def split_primitive_columns_by_task_name(
     coordinate_types: Optional[List[Type[Primitive]]] = None,
 ) -> pl.DataFrame:
     """
-    Convert Primitive columns such as "
-    For example, if the "
+    Convert Primitive columns such as "bboxes" (Bbox) into a column for each task name.
+    For example, if the "bboxes" column (containing Bbox objects) has tasks "task1" and "task2".


     This:
     ─┬────────────┬─
-     ┆
+     ┆ bboxes     ┆
     ┆ ---        ┆
     ┆ list[struc ┆
     ┆ t[11]]     ┆
     ═╪════════════╪═
     becomes this:
     ─┬────────────┬────────────┬─
-     ┆
+     ┆ bboxes.    ┆ bboxes.    ┆
     ┆ task1      ┆ task2      ┆
     ┆ ---        ┆ ---        ┆
     ┆ list[struc ┆ list[struc ┆
@@ -131,11 +183,11 @@ def split_primitive_columns_by_task_name(
         if samples_table[col_name].dtype != pl.List(pl.Struct):
             continue

-        task_names = samples_table[col_name].explode().struct.field(
+        task_names = samples_table[col_name].explode().struct.field(PrimitiveField.TASK_NAME).unique().to_list()
         samples_table = samples_table.with_columns(
             [
                 pl.col(col_name)
-                .list.filter(pl.element().struct.field(
+                .list.filter(pl.element().struct.field(PrimitiveField.TASK_NAME).eq(task_name))
                 .alias(f"{col_name}.{task_name}")
                 for task_name in task_names
             ]
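The per-task split documented above produces one `list[struct]` column per task by filtering on the struct's `task_name` field. A small standalone sketch of that transformation (not package code; the data is invented):

```python
# Illustrative only: split a list[struct] column into "<column>.<task_name>" columns.
import polars as pl

samples = pl.DataFrame(
    {
        "bboxes": [
            [
                {"class_name": "car", "task_name": "task1"},
                {"class_name": "person", "task_name": "task2"},
            ]
        ]
    }
)

task_names = samples["bboxes"].explode().struct.field("task_name").unique().to_list()
samples = samples.with_columns(
    [
        pl.col("bboxes")
        .list.filter(pl.element().struct.field("task_name").eq(task_name))
        .alias(f"bboxes.{task_name}")
        for task_name in task_names
    ]
)
print(samples.columns)  # ['bboxes', 'bboxes.task1', 'bboxes.task2'] (task order may vary)
```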
@@ -144,7 +196,7 @@ def split_primitive_columns_by_task_name(
     return samples_table


-def
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -162,7 +214,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:

 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-
+    org_paths = table[SampleField.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
         org_path = Path(org_path)
         if not org_path.exists():
             missing_files.append(org_path)
@@ -218,3 +271,68 @@ def unnest_classification_tasks(table: pl.DataFrame, strict: bool = True) -> pl.

     table_out = table_out.with_columns([pl.col(c).list.first() for c in classification_columns])
     return table_out
+
+
+def update_class_indices(samples: pl.DataFrame, task: "TaskInfo") -> pl.DataFrame:
+    if task.class_names is None or len(task.class_names) == 0:
+        raise ValueError(f"Task '{task.name}' does not have defined class names to update class indices.")
+
+    objs = (
+        samples[task.primitive.column_name()]
+        .explode()
+        .struct.unnest()
+        .filter(pl.col(PrimitiveField.TASK_NAME) == task.name)
+    )
+    expected_class_names = set(objs[PrimitiveField.CLASS_NAME].unique())
+    missing_class_names = expected_class_names - set(task.class_names)
+    if len(missing_class_names) > 0:
+        raise ValueError(
+            f"Task '{task.name}' is missing class names: {missing_class_names}. Cannot update class indices."
+        )
+
+    name_2_idx_mapping = {name: idx for idx, name in enumerate(task.class_names)}
+
+    samples_updated = samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(PrimitiveField.TASK_NAME) == task.name)
+                .then(pl.field(PrimitiveField.CLASS_NAME).replace_strict(name_2_idx_mapping, default=-1))
+                .otherwise(pl.field(PrimitiveField.CLASS_IDX))
+                .alias(PrimitiveField.CLASS_IDX)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    return samples_updated
+
+
+def add_sample_index(samples: pl.DataFrame) -> pl.DataFrame:
+    """
+    Adds a sample index column to the samples DataFrame.
+
+    Note: Unlike the built-in 'polars.DataFrame.with_row_count', this function
+    always guarantees 'pl.UInt64' type for the index column.
+    """
+    if SampleField.SAMPLE_INDEX in samples.columns:
+        samples = samples.drop(SampleField.SAMPLE_INDEX)
+    samples = samples.select(
+        pl.int_range(0, pl.count(), dtype=pl.UInt64).alias(SampleField.SAMPLE_INDEX),
+        pl.all(),
+    )
+    return samples
+
+
+def add_dataset_name_if_missing(table: pl.DataFrame, dataset_name: str) -> pl.DataFrame:
+    if SampleField.DATASET_NAME not in table.columns:
+        table = table.with_columns(pl.lit(dataset_name).alias(SampleField.DATASET_NAME))
+    else:
+        table = table.with_columns(
+            pl.when(pl.col(SampleField.DATASET_NAME).is_null())
+            .then(pl.lit(dataset_name))
+            .otherwise(pl.col(SampleField.DATASET_NAME))
+            .alias(SampleField.DATASET_NAME)
+        )
+
+    return table
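The new `add_sample_index` prepends a row index and, unlike `with_row_count`, pins its dtype to `pl.UInt64`. The gist in isolation (not package code; this sketch uses `pl.len()` where the package calls `pl.count()`, and the column name is assumed):

```python
# Illustrative only: prepend a UInt64 sample index column.
import polars as pl

samples = pl.DataFrame({"file_name": ["a.jpg", "b.jpg", "c.jpg"]})
samples = samples.select(
    pl.int_range(0, pl.len(), dtype=pl.UInt64).alias("sample_index"),
    pl.all(),
)
print(samples.schema["sample_index"])  # UInt64
```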
hafnia/dataset/primitives/bbox.py  CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,26 +18,36 @@ from hafnia.dataset.primitives.utils import (

 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float
-
-
-
-
-
-
-
-
-
-
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "
+        return "object_detection"

     @staticmethod
     def column_name() -> str:
-        return "
+        return "bboxes"

     def calculate_area(self) -> float:
         return self.height * self.width
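The `Bbox` fields above are normalized to the image size, so converting to pixels is a plain multiplication. A quick worked example (values invented, not package code):

```python
# Illustrative only: normalized Bbox fields -> pixel coordinates.
image_height, image_width = 720, 1280
bbox = {"top_left_x": 0.25, "top_left_y": 0.10, "width": 0.50, "height": 0.30}

x0 = int(bbox["top_left_x"] * image_width)   # 320
y0 = int(bbox["top_left_y"] * image_height)  # 72
w = int(bbox["width"] * image_width)         # 640
h = int(bbox["height"] * image_height)       # 216
print((x0, y0, w, h))
```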
hafnia/dataset/primitives/bitmask.py  CHANGED
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple

 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,24 +18,30 @@ from hafnia.dataset.primitives.utils import (

 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int
-    left: int
-    height: int
-    width: int
-    rleString: str
-
-
-
-
-
-
-
-
-
+    top: int = Field(description="Bitmask top coordinate in pixels ")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels is calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "
+        return "mask_detection"

     @staticmethod
     def column_name() -> str:
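`Bitmask.rleString` stores a COCO-style run-length encoding of the mask crop, which `pycocotools` (already imported by this module) can encode and decode. A minimal round trip, not package code, with an invented mask:

```python
# Illustrative only: COCO RLE round trip for a small binary mask.
import numpy as np
import pycocotools.mask as coco_mask

mask = np.zeros((4, 6), dtype=np.uint8, order="F")  # encode() expects Fortran order
mask[1:3, 2:5] = 1

rle = coco_mask.encode(mask)       # dict with 'size' and a compressed 'counts' string
area = float(coco_mask.area(rle))  # 6.0 -> number of set pixels
decoded = coco_mask.decode(rle)
assert np.array_equal(decoded, mask)
print(area)
```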
hafnia/dataset/primitives/classification.py  CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple

 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,18 +9,25 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_nam

 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence:
-
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(
+        default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    task_name: str =
-
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )

     @staticmethod
     def default_task_name() -> str:
-        return "
+        return "image_classification"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/point.py  CHANGED
@@ -1,13 +1,17 @@
 from typing import Any, Tuple

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from hafnia.dataset.primitives.utils import clip


 class Point(BaseModel):
-    x: float
-
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )

     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
hafnia/dataset/primitives/polygon.py  CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,19 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name

 class Polygon(Primitive):
     # Names should match names in FieldName
-    points: List[Point]
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence:
-    ground_truth: bool = True
-
-    task_name: str =
-
+    points: List[Point] = Field(description="List of points defining the polygon")
+    class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the polygon"
+    )

     @staticmethod
     def from_list_of_points(
@@ -33,7 +38,7 @@ class Polygon(Primitive):

     @staticmethod
     def default_task_name() -> str:
-        return "
+        return "polygon_detection"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/segmentation.py  CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import get_class_name
@@ -9,23 +10,25 @@ from hafnia.visualizations.colors import get_n_colors


 class Segmentation(Primitive):
-    #
-    class_names: Optional[List[str]] = None
-    ground_truth: bool = True
+    # WARNING: Segmentation masks have not been fully implemented yet
+    class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-
-
-    "
+    task_name: str = Field(
+        default="",
+        description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the segmentation"
     )
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask

     @staticmethod
     def default_task_name() -> str:
-        return "
+        return "semantic_segmentation"

     @staticmethod
     def column_name() -> str:
-        return "
+        return "segmentations"

     def calculate_area(self) -> float:
         raise NotImplementedError()
hafnia/experiment/hafnia_logger.py  CHANGED
@@ -12,8 +12,6 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from pydantic import BaseModel, field_validator

-from hafnia.data.factory import load_dataset
-from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
 from hafnia.utils import is_hafnia_cloud_job, now_as_str

@@ -136,13 +134,6 @@ class HafniaLogger:
         except Exception as e:
             user_logger.error(f"Failed to initialize MLflow: {e}")

-    def load_dataset(self, dataset_name: str) -> HafniaDataset:
-        """
-        Load a dataset from the specified path.
-        """
-        self.dataset_name = dataset_name
-        return load_dataset(dataset_name)
-
     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
         if is_hafnia_cloud_job():
hafnia/platform/dataset_recipe.py  CHANGED
@@ -11,12 +11,17 @@ from hafnia.utils import pretty_print_list_as_table, timed

 @timed("Get or create dataset recipe")
 def get_or_create_dataset_recipe(
-    recipe: dict,
+    recipe: dict,
+    endpoint: str,
+    api_key: str,
+    name: Optional[str] = None,
+    overwrite: bool = False,
 ) -> Optional[Dict]:
     headers = {"Authorization": api_key}
-    data = {"template": {"body": recipe}}
+    data = {"template": {"body": recipe}, "overwrite": overwrite}
     if name is not None:
         data["name"] = name  # type: ignore[assignment]
+
     response = http.post(endpoint, headers=headers, data=data)
     return response

hafnia/platform/datasets.py  CHANGED
@@ -9,11 +9,11 @@ from typing import Any, Dict, List, Optional

 import rich
 from rich import print as rprint
-from
+from rich.progress import track

 from cli.config import Config
 from hafnia import http, utils
-from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
     get_dataset_path_from_recipe,
@@ -120,15 +120,11 @@ def download_dataset_from_access_endpoint(
         return
     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     try:
-
-            src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
-            dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
-            append_envs=envs,
-            description="Downloading images",
-        )
+        dataset = dataset.download_files_aws(path_dataset, aws_credentials=resource_credentials, force_redownload=True)
     except ValueError as e:
         user_logger.error(f"Failed to download images: {e}")
         return
+    dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded


 def fast_copy_files_s3(
@@ -196,7 +192,7 @@ def execute_s5cmd_commands(

     error_lines = []
     lines = []
-    for line in
+    for line in track(process.stdout, total=len(commands), description=description):
         if "ERROR" in line or "error" in line:
             error_lines.append(line.strip())
         lines.append(line.strip())