hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +16 -3
- cli/config.py +45 -4
- cli/consts.py +1 -1
- cli/dataset_cmds.py +6 -14
- cli/dataset_recipe_cmds.py +78 -0
- cli/experiment_cmds.py +226 -43
- cli/keychain.py +88 -0
- cli/profile_cmds.py +10 -6
- cli/runc_cmds.py +5 -5
- cli/trainer_package_cmds.py +65 -0
- hafnia/__init__.py +2 -0
- hafnia/data/factory.py +1 -2
- hafnia/dataset/dataset_helpers.py +9 -14
- hafnia/dataset/dataset_names.py +10 -5
- hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
- hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
- hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
- hafnia/dataset/dataset_upload_helper.py +265 -56
- hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
- hafnia/dataset/hafnia_dataset.py +577 -213
- hafnia/dataset/license_types.py +63 -0
- hafnia/dataset/operations/dataset_stats.py +259 -3
- hafnia/dataset/operations/dataset_transformations.py +332 -7
- hafnia/dataset/operations/table_transformations.py +43 -5
- hafnia/dataset/primitives/__init__.py +8 -0
- hafnia/dataset/primitives/bbox.py +25 -12
- hafnia/dataset/primitives/bitmask.py +26 -14
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +16 -9
- hafnia/dataset/primitives/segmentation.py +10 -7
- hafnia/experiment/hafnia_logger.py +111 -8
- hafnia/http.py +16 -2
- hafnia/platform/__init__.py +9 -3
- hafnia/platform/builder.py +12 -10
- hafnia/platform/dataset_recipe.py +104 -0
- hafnia/platform/datasets.py +47 -9
- hafnia/platform/download.py +25 -19
- hafnia/platform/experiment.py +51 -56
- hafnia/platform/trainer_package.py +57 -0
- hafnia/utils.py +81 -13
- hafnia/visualizations/image_visualizations.py +4 -4
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
- hafnia-0.4.0.dist-info/RECORD +56 -0
- cli/recipe_cmds.py +0 -45
- hafnia-0.2.4.dist-info/RECORD +0 -49
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
--- a/hafnia/dataset/operations/dataset_transformations.py
+++ b/hafnia/dataset/operations/dataset_transformations.py
@@ -29,19 +29,27 @@ HafniaDataset class and a RecipeTransform class in the `data_recipe/recipe_transforms.py`
 that the signatures match.
 """
 
+import json
+import re
+import textwrap
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import cv2
+import more_itertools
 import numpy as np
 import polars as pl
 from PIL import Image
-from
+from rich.progress import track
 
 from hafnia.dataset import dataset_helpers
+from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, ColumnName, FieldName
+from hafnia.dataset.primitives import get_primitive_type_from_string
+from hafnia.dataset.primitives.primitive import Primitive
+from hafnia.utils import remove_duplicates_preserve_order
 
-if TYPE_CHECKING:
-    from hafnia.dataset.hafnia_dataset import HafniaDataset
+if TYPE_CHECKING:  # Using 'TYPE_CHECKING' to avoid circular imports during type checking
    from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
 
 
 ### Image transformations ###
@@ -65,7 +73,8 @@ def transform_images(
     path_image_folder = path_output / "data"
     path_image_folder.mkdir(parents=True, exist_ok=True)
 
-
+    org_paths = dataset.samples[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Transform images"):
         org_path = Path(org_path)
         if not org_path.exists():
             raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
@@ -78,5 +87,321 @@ def transform_images(
             raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
         new_paths.append(str(new_path))
 
-    table = dataset.samples.with_columns(pl.Series(new_paths).alias(
-    return dataset.
+    table = dataset.samples.with_columns(pl.Series(new_paths).alias(ColumnName.FILE_PATH))
+    return dataset.update_samples(table)
+
+
+def get_task_info_from_task_name_and_primitive(
+    tasks: List["TaskInfo"],
+    task_name: Optional[str] = None,
+    primitive: Union[None, str, Type[Primitive]] = None,
+) -> "TaskInfo":
+    if len(tasks) == 0:
+        raise ValueError("Dataset has no tasks defined.")
+
+    tasks_str = "\n".join([f"\t{task.__repr__()}" for task in tasks])
+    if task_name is None and primitive is None:
+        if len(tasks) == 1:
+            return tasks[0]
+        else:
+            raise ValueError(
+                "For multiple tasks, you will need to specify 'task_name' or 'primitive' "
+                "to return a unique task. The dataset contains the following tasks: \n" + tasks_str
+            )
+
+    if isinstance(primitive, str):
+        primitive = get_primitive_type_from_string(primitive)
+
+    tasks_filtered = tasks
+    if primitive is None:
+        tasks_filtered = [task for task in tasks if task.name == task_name]
+
+        if len(tasks_filtered) == 0:
+            raise ValueError(f"No task found with {task_name=}. Available tasks: \n {tasks_str}")
+
+        unique_primitives = set(task.primitive for task in tasks_filtered)
+        if len(unique_primitives) > 1:
+            raise ValueError(
+                f"Found multiple tasks with {task_name=} using different primitives {unique_primitives=}. "
+                "Please specify the primitive type to make it unique. "
+                f"The dataset contains the following tasks: \n {tasks_str}"
+            )
+        primitive = list(unique_primitives)[0]
+
+    if task_name is None:
+        tasks_filtered = [task for task in tasks if task.primitive == primitive]
+        if len(tasks_filtered) == 0:
+            raise ValueError(f"No task found with {primitive=}. Available tasks: \n {tasks_str}")
+
+        unique_task_names = set(task.name for task in tasks_filtered)
+        if len(unique_task_names) > 1:
+            raise ValueError(
+                f"Found multiple tasks with {primitive=} using different task names {unique_task_names=}. "
+                "Please specify the 'task_name' to make it unique. "
+                f"The dataset contains the following tasks: \n {tasks_str}"
+            )
+        task_name = list(unique_task_names)[0]
+
+    tasks_filtered = [task for task in tasks_filtered if task.primitive == primitive and task.name == task_name]
+    if len(tasks_filtered) == 0:
+        raise ValueError(f"No task found with {task_name=} and {primitive=}. Available tasks: \n {tasks_str}")
+
+    if len(tasks_filtered) > 1:
+        raise ValueError(
+            f"Multiple tasks found with {task_name=} and {primitive=}. "
+            f"This should never happen. The dataset contains the following tasks: \n {tasks_str}"
+        )
+    task = tasks_filtered[0]
+    return task
+
+
+def class_mapper(
+    dataset: "HafniaDataset",
+    class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
+    method: str = "strict",
+    primitive: Optional[Type[Primitive]] = None,
+    task_name: Optional[str] = None,
+) -> "HafniaDataset":
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+    if isinstance(class_mapping, list):
+        class_mapping = dict(class_mapping)
+
+    allowed_methods = ("strict", "remove_undefined", "keep_undefined")
+    if method not in allowed_methods:
+        raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
+
+    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=primitive)
+    current_names = task.class_names or []
+
+    # Expand wildcard mappings e.g. {"Vehicle.*": "Vehicle"} to {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}
+    class_mapping = expand_class_mapping(class_mapping, current_names)
+
+    non_existing_mapping_names = set(class_mapping) - set(current_names)
+    if len(non_existing_mapping_names) > 0:
+        raise ValueError(
+            f"The specified class mapping contains class names {list(non_existing_mapping_names)} "
+            f"that do not exist in the dataset task '{task.name}'. "
+            f"Available class names: {current_names}"
+        )
+
+    missing_class_names = [c for c in current_names if c not in class_mapping]  # List-comprehension to preserve order
+    class_mapping = class_mapping.copy()
+    if method == "strict":
+        pass  # Continue to strict mapping below
+    elif method == "remove_undefined":
+        for missing_class_name in missing_class_names:
+            class_mapping[missing_class_name] = OPS_REMOVE_CLASS
+    elif method == "keep_undefined":
+        for missing_class_name in missing_class_names:
+            class_mapping[missing_class_name] = missing_class_name
+    else:
+        raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
+
+    missing_class_names = [c for c in current_names if c not in class_mapping]
+    if len(missing_class_names) > 0:
+        error_msg = f"""\
+        The specified class mapping is not a strict mapping - meaning that not all class names have
+        been mapped to a new class name.
+        In the current mapping, the following classes {list(missing_class_names)} have not been mapped.
+        The currently specified mapping is:
+        {json.dumps(class_mapping, indent=2)}
+        A strict mapping will map all old class names (dictionary keys) to new class names (dictionary values).
+        Please update the mapping to include all class names from the dataset task '{task.name}'.
+        To keep a class, map it to the same name, e.g. 'person': 'person',
+        or remove a class by mapping it to '__REMOVE__', e.g. 'person': '__REMOVE__'."""
+        raise ValueError(textwrap.dedent(error_msg))
+
+    new_class_names = remove_duplicates_preserve_order(class_mapping.values())
+
+    if OPS_REMOVE_CLASS in new_class_names:
+        # Move __REMOVE__ to the end of the list if it exists
+        new_class_names.append(new_class_names.pop(new_class_names.index(OPS_REMOVE_CLASS)))
+
+    samples = dataset.samples
+    samples_updated = samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(FieldName.TASK_NAME) == task.name)
+                .then(pl.field(FieldName.CLASS_NAME).replace_strict(class_mapping))
+                .otherwise(pl.field(FieldName.CLASS_NAME))
+                .alias(FieldName.CLASS_NAME)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    # Update class indices too
+    name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}
+    samples_updated = samples_updated.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(FieldName.TASK_NAME) == task.name)
+                .then(pl.field(FieldName.CLASS_NAME).replace_strict(name_2_idx_mapping))
+                .otherwise(pl.field(FieldName.CLASS_IDX))
+                .alias(FieldName.CLASS_IDX)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    if OPS_REMOVE_CLASS in new_class_names:  # Remove class_names that are mapped to REMOVE_CLASS
+        samples_updated = samples_updated.with_columns(
+            pl.col(task.primitive.column_name())
+            .list.filter(pl.element().struct.field(FieldName.CLASS_NAME) != OPS_REMOVE_CLASS)
+            .alias(task.primitive.column_name())
+        )
+
+        new_class_names = [c for c in new_class_names if c != OPS_REMOVE_CLASS]
+
+    new_task = task.model_copy(deep=True)
+    new_task.class_names = new_class_names
+    dataset_info = dataset.info.replace_task(old_task=task, new_task=new_task)
+    return HafniaDataset(info=dataset_info, samples=samples_updated)
+
+
+def expand_class_mapping(wildcard_mapping: Dict[str, str], class_names: List[str]) -> Dict[str, str]:
+    """
+    Expand a wildcard class mapping to a full explicit mapping.
+
+    This function takes a mapping that may contain wildcard patterns (using '*')
+    and expands them to match actual class names from a dataset. Exact matches
+    take precedence over wildcard patterns.
+
+    Examples:
+        >>> from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS
+        >>> wildcard_mapping = {
+        ...     "Person": "Person",
+        ...     "Vehicle.*": "Vehicle",
+        ...     "Vehicle.Trailer": OPS_REMOVE_CLASS,
+        ... }
+        >>> class_names = [
+        ...     "Person", "Vehicle.Car", "Vehicle.Trailer", "Vehicle.Bus", "Animal.Dog"
+        ... ]
+        >>> result = expand_class_mapping(wildcard_mapping, class_names)
+        >>> print(result)
+        {
+            "Person": "Person",
+            "Vehicle.Car": "Vehicle",
+            "Vehicle.Trailer": OPS_REMOVE_CLASS,  # Exact match overrides wildcard
+            "Vehicle.Bus": "Vehicle",
+            # Note: "Animal.Dog" is not included as it doesn't match any pattern
+        }
+    """
+    expanded_mapping = {}
+    for match_pattern, mapping_value in wildcard_mapping.items():
+        if "*" in match_pattern:
+            # Convert wildcard pattern to regex: Escape special regex characters except *, then replace * with .*
+            regex_pattern = re.escape(match_pattern).replace("\\*", ".*")
+            class_names_matched = [cn for cn in class_names if re.fullmatch(regex_pattern, cn)]
+            expanded_mapping.update({cn: mapping_value for cn in class_names_matched})
+        else:
+            expanded_mapping.pop(match_pattern, None)
+            expanded_mapping[match_pattern] = mapping_value
+    return expanded_mapping
+
+
+def rename_task(
+    dataset: "HafniaDataset",
+    old_task_name: str,
+    new_task_name: str,
+) -> "HafniaDataset":
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+    old_task = dataset.info.get_task_by_name(task_name=old_task_name)
+    new_task = old_task.model_copy(deep=True)
+    new_task.name = new_task_name
+    samples = dataset.samples.with_columns(
+        pl.col(old_task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.field(FieldName.TASK_NAME).replace(old_task.name, new_task.name).alias(FieldName.TASK_NAME)
+            )
+        )
+        .alias(new_task.primitive.column_name())
+    )
+
+    dataset_info = dataset.info.replace_task(old_task=old_task, new_task=new_task)
+    return HafniaDataset(info=dataset_info, samples=samples)
+
+
+def select_samples_by_class_name(
+    dataset: "HafniaDataset",
+    name: Union[List[str], str],
+    task_name: Optional[str] = None,
+    primitive: Optional[Type[Primitive]] = None,
+) -> "HafniaDataset":
+    task, class_names = _validate_inputs_select_samples_by_class_name(
+        dataset=dataset,
+        name=name,
+        task_name=task_name,
+        primitive=primitive,
+    )
+
+    samples = dataset.samples.filter(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names)
+            & (pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        )
+        .list.any()
+    )
+
+    dataset_updated = dataset.update_samples(samples)
+    return dataset_updated
+
+
+def _validate_inputs_select_samples_by_class_name(
+    dataset: "HafniaDataset",
+    name: Union[List[str], str],
+    task_name: Optional[str] = None,
+    primitive: Optional[Type[Primitive]] = None,
+) -> Tuple["TaskInfo", List[str]]:
+    if isinstance(name, str):
+        name = [name]
+    names = list(name)
+
+    # Check that specified names are available in at least one of the tasks
+    available_names_across_tasks = set(more_itertools.flatten([t.class_names for t in dataset.info.tasks]))
+    missing_class_names_across_tasks = set(names) - available_names_across_tasks
+    if len(missing_class_names_across_tasks) > 0:
+        raise ValueError(
+            f"The specified names {list(names)} have not been found in any of the tasks. "
+            f"Available class names: {available_names_across_tasks}"
+        )
+
+    # Auto infer task if task_name and primitive are not provided
+    if task_name is None and primitive is None:
+        tasks_with_names = [t for t in dataset.info.tasks if set(names).issubset(t.class_names or [])]
+        if len(tasks_with_names) == 0:
+            raise ValueError(
+                f"The specified names {names} have not been found in any of the tasks. "
+                f"Available class names: {available_names_across_tasks}"
+            )
+        if len(tasks_with_names) > 1:
+            raise ValueError(
+                f"Found multiple tasks containing the specified names {names}. "
+                f"Specify either 'task_name' or 'primitive' to only select from one task. "
+                f"Tasks containing all provided names: {[t.name for t in tasks_with_names]}"
+            )
+
+        task = tasks_with_names[0]
+
+    else:
+        task = get_task_info_from_task_name_and_primitive(
+            tasks=dataset.info.tasks,
+            task_name=task_name,
+            primitive=primitive,
+        )
+
+    task_class_names = set(task.class_names or [])
+    missing_class_names = set(names) - task_class_names
+    if len(missing_class_names) > 0:
+        raise ValueError(
+            f"The specified names {list(missing_class_names)} have not been found for the '{task.name}' task. "
+            f"Available class names: {task_class_names}"
+        )
+
+    return task, names
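The new `class_mapper`, `expand_class_mapping`, and `select_samples_by_class_name` helpers added above make taxonomy remapping a one-call operation. A minimal usage sketch, assuming an already-loaded `HafniaDataset`; the dataset variable and class names are illustrative, not from this diff:

```python
from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS
from hafnia.dataset.operations.dataset_transformations import class_mapper, select_samples_by_class_name

dataset = ...  # an existing HafniaDataset (illustrative)

# Collapse every "Vehicle.*" subclass into "Vehicle", but drop trailers entirely.
# Exact keys override wildcard keys; "keep_undefined" leaves unmapped classes unchanged.
remapped = class_mapper(
    dataset,
    class_mapping={"Vehicle.*": "Vehicle", "Vehicle.Trailer": OPS_REMOVE_CLASS},
    method="keep_undefined",
)

# Keep only the samples that contain at least one "Vehicle" annotation.
vehicles_only = select_samples_by_class_name(remapped, name="Vehicle")
```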
--- a/hafnia/dataset/operations/table_transformations.py
+++ b/hafnia/dataset/operations/table_transformations.py
@@ -2,11 +2,12 @@ from pathlib import Path
 from typing import List, Optional, Type
 
 import polars as pl
-from
+from rich.progress import track
 
 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
+    ColumnName,
     FieldName,
 )
 from hafnia.dataset.operations import table_transformations
@@ -34,8 +35,12 @@ def create_primitive_table(
 
     if keep_sample_data:
         # Drop other primitive columns to avoid conflicts
-
-
+
+        drop_columns_primitives = set(PRIMITIVE_TYPES) - {PrimitiveType, Classification}
+        drop_columns_names = [primitive.column_name() for primitive in drop_columns_primitives]
+        drop_columns_names = [c for c in drop_columns_names if c in remove_no_object_frames.columns]
+
+        remove_no_object_frames = remove_no_object_frames.drop(drop_columns_names)
         # Rename columns "height", "width" and "meta" for sample to avoid conflicts with object fields names
         remove_no_object_frames = remove_no_object_frames.rename(
             {"height": "image.height", "width": "image.width", "meta": "image.meta"}
@@ -46,6 +51,38 @@ def create_primitive_table(
     return objects_df
 
 
+def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
+    has_same_schema = samples0.schema == samples1.schema
+    if not has_same_schema:
+        shared_columns = []
+        for column_name, column_type in samples0.schema.items():
+            if column_name not in samples1.schema:
+                continue
+
+            if column_type != samples1.schema[column_name]:
+                continue
+            shared_columns.append(column_name)
+
+        dropped_columns0 = [
+            f"{n}[{ctype._string_repr()}]" for n, ctype in samples0.schema.items() if n not in shared_columns
+        ]
+        dropped_columns1 = [
+            f"{n}[{ctype._string_repr()}]" for n, ctype in samples1.schema.items() if n not in shared_columns
+        ]
+        user_logger.warning(
+            "Datasets with different schemas are being merged. "
+            "Only the columns with the same name and type will be kept in the merged dataset.\n"
+            f"Dropped columns in samples0: {dropped_columns0}\n"
+            f"Dropped columns in samples1: {dropped_columns1}\n"
+        )
+
+        samples0 = samples0.select(list(shared_columns))
+        samples1 = samples1.select(list(shared_columns))
+    merged_samples = pl.concat([samples0, samples1], how="vertical")
+    merged_samples = merged_samples.drop(ColumnName.SAMPLE_INDEX).with_row_index(name=ColumnName.SAMPLE_INDEX)
+    return merged_samples
+
+
 def filter_table_for_class_names(
     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
 ) -> Optional[pl.DataFrame]:
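A sketch of the `merge_samples` semantics above: columns survive only if both name and dtype match, and the sample index is rebuilt after concatenation. This example assumes `ColumnName.SAMPLE_INDEX` resolves to `"sample_index"`; the other column names are made up for illustration:

```python
import polars as pl

samples0 = pl.DataFrame({
    "sample_index": [0, 1],
    "file_name": ["a.jpg", "b.jpg"],
    "split": ["train", "train"],  # present only in samples0 -> dropped with a warning
})
samples1 = pl.DataFrame({
    "sample_index": [0],
    "file_name": ["c.jpg"],
})

merged = merge_samples(samples0, samples1)
# merged has columns ["sample_index", "file_name"] and 3 rows;
# "sample_index" is regenerated as 0, 1, 2 via drop() + with_row_index().
```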
@@ -107,7 +144,7 @@ def split_primitive_columns_by_task_name(
     return samples_table
 
 
-def read_table_from_path(path: Path) -> pl.DataFrame:
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -125,7 +162,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:
 
 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-
+    org_paths = table[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
         org_path = Path(org_path)
         if not org_path.exists():
             missing_files.append(org_path)
--- a/hafnia/dataset/primitives/__init__.py
+++ b/hafnia/dataset/primitives/__init__.py
@@ -14,3 +14,11 @@ from .utils import class_color_by_name  # noqa: F401
 PRIMITIVE_TYPES: List[Type[Primitive]] = [Bbox, Classification, Polygon, Bitmask]
 PRIMITIVE_NAME_TO_TYPE = {cls.__name__: cls for cls in PRIMITIVE_TYPES}
 PRIMITIVE_COLUMN_NAMES: List[str] = [PrimitiveType.column_name() for PrimitiveType in PRIMITIVE_TYPES]
+
+
+def get_primitive_type_from_string(name: str) -> Type[Primitive]:
+    if name not in PRIMITIVE_NAME_TO_TYPE:
+        raise ValueError(
+            f"Primitive '{name}' is not recognized. Available primitives: {list(PRIMITIVE_NAME_TO_TYPE.keys())}"
+        )
+    return PRIMITIVE_NAME_TO_TYPE[name]
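`get_primitive_type_from_string` gives recipes and serialized configs a way to refer to primitive classes by name. Usage follows directly from the definition above:

```python
from hafnia.dataset.primitives import get_primitive_type_from_string

bbox_type = get_primitive_type_from_string("Bbox")  # returns the Bbox class
get_primitive_type_from_string("Cuboid")            # raises ValueError listing the available primitives
```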
--- a/hafnia/dataset/primitives/bbox.py
+++ b/hafnia/dataset/primitives/bbox.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cv2
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,18 +18,30 @@ from hafnia.dataset.primitives.utils import (
 
 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float
-    width: float
-    top_left_x: float
-    top_left_y: float
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence: Optional[float] = None
-    ground_truth: bool = True
-
-    task_name: str = ""
-    meta: Optional[Dict[str, Any]] = None
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")
 
     @staticmethod
     def default_task_name() -> str:
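All Bbox geometry is stored as fractions of the image size, so pixel conversion is plain arithmetic. A small sketch based on the fields above (the conversion is written inline here rather than through any package helper):

```python
from hafnia.dataset.primitives.bbox import Bbox

# A box covering the right half of a frame; only the four geometry fields are required.
box = Bbox(top_left_x=0.5, top_left_y=0.0, width=0.5, height=1.0, class_name="car", class_idx=0)

img_height, img_width = 720, 1280
x0 = int(box.top_left_x * img_width)   # 640
y0 = int(box.top_left_y * img_height)  # 0
w = int(box.width * img_width)         # 640
h = int(box.height * img_height)       # 720
```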
--- a/hafnia/dataset/primitives/bitmask.py
+++ b/hafnia/dataset/primitives/bitmask.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field
 
+from hafnia.dataset.dataset_names import FieldName
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
     anonymize_by_resizing,
@@ -14,23 +16,33 @@ from hafnia.dataset.primitives.utils import (
     text_org_from_left_bottom_to_centered,
 )
 
+FieldName
+
 
 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int
-    left: int
-    height: int
-    width: int
-    rleString: str
-    area: Optional[float] = None
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence: Optional[float] = None
-    ground_truth: bool = True
-
-    task_name: str = ""
-    meta: Optional[Dict[str, Any]] = None
+    top: int = Field(description="Bitmask top coordinate in pixels")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels, calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")
 
     @staticmethod
     def default_task_name() -> str:
--- a/hafnia/dataset/primitives/classification.py
+++ b/hafnia/dataset/primitives/classification.py
@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,14 +9,21 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
 
 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None
-    class_idx: Optional[int] = None
-    object_id: Optional[str] = None
-    confidence: Optional[float] = None
-    ground_truth: bool = True
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
 
-    task_name: str = ""
-    meta: Optional[Dict[str, Any]] = None
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )
 
     @staticmethod
     def default_task_name() -> str:
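With `confidence` and `ground_truth` now documented via `Field`, the same `Classification` schema covers both labels and model outputs. An illustrative construction (task and class names are made up):

```python
from hafnia.dataset.primitives.classification import Classification

label = Classification(class_name="day", class_idx=0, task_name="time_of_day")
prediction = Classification(
    class_name="night", class_idx=1, task_name="time_of_day",
    confidence=0.87, ground_truth=False,
)
```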
@@ -38,7 +46,7 @@ class Classification(Primitive):
             text = class_name
         else:
             text = f"{self.task_name}: {class_name}"
-        image = image_visualizations.append_text_below_frame(image, text=text)
+        image = image_visualizations.append_text_below_frame(image, text=text, text_size_ratio=0.05)
 
         return image
 
--- a/hafnia/dataset/primitives/point.py
+++ b/hafnia/dataset/primitives/point.py
@@ -1,13 +1,17 @@
 from typing import Any, Tuple
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from hafnia.dataset.primitives.utils import clip
 
 
 class Point(BaseModel):
-    x: float
-    y: float
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )
 
     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True