hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. cli/__main__.py +16 -3
  2. cli/config.py +45 -4
  3. cli/consts.py +1 -1
  4. cli/dataset_cmds.py +6 -14
  5. cli/dataset_recipe_cmds.py +78 -0
  6. cli/experiment_cmds.py +226 -43
  7. cli/keychain.py +88 -0
  8. cli/profile_cmds.py +10 -6
  9. cli/runc_cmds.py +5 -5
  10. cli/trainer_package_cmds.py +65 -0
  11. hafnia/__init__.py +2 -0
  12. hafnia/data/factory.py +1 -2
  13. hafnia/dataset/dataset_helpers.py +9 -14
  14. hafnia/dataset/dataset_names.py +10 -5
  15. hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
  16. hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
  17. hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
  18. hafnia/dataset/dataset_upload_helper.py +265 -56
  19. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  20. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  21. hafnia/dataset/hafnia_dataset.py +577 -213
  22. hafnia/dataset/license_types.py +63 -0
  23. hafnia/dataset/operations/dataset_stats.py +259 -3
  24. hafnia/dataset/operations/dataset_transformations.py +332 -7
  25. hafnia/dataset/operations/table_transformations.py +43 -5
  26. hafnia/dataset/primitives/__init__.py +8 -0
  27. hafnia/dataset/primitives/bbox.py +25 -12
  28. hafnia/dataset/primitives/bitmask.py +26 -14
  29. hafnia/dataset/primitives/classification.py +16 -8
  30. hafnia/dataset/primitives/point.py +7 -3
  31. hafnia/dataset/primitives/polygon.py +16 -9
  32. hafnia/dataset/primitives/segmentation.py +10 -7
  33. hafnia/experiment/hafnia_logger.py +111 -8
  34. hafnia/http.py +16 -2
  35. hafnia/platform/__init__.py +9 -3
  36. hafnia/platform/builder.py +12 -10
  37. hafnia/platform/dataset_recipe.py +104 -0
  38. hafnia/platform/datasets.py +47 -9
  39. hafnia/platform/download.py +25 -19
  40. hafnia/platform/experiment.py +51 -56
  41. hafnia/platform/trainer_package.py +57 -0
  42. hafnia/utils.py +81 -13
  43. hafnia/visualizations/image_visualizations.py +4 -4
  44. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
  45. hafnia-0.4.0.dist-info/RECORD +56 -0
  46. cli/recipe_cmds.py +0 -45
  47. hafnia-0.2.4.dist-info/RECORD +0 -49
  48. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  49. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  50. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -29,19 +29,27 @@ HafniaDataset class and a RecipeTransform class in the `data_recipe/recipe_trans
 that the signatures match.
 """
 
+import json
+import re
+import textwrap
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import cv2
+import more_itertools
 import numpy as np
 import polars as pl
 from PIL import Image
-from tqdm import tqdm
+from rich.progress import track
 
 from hafnia.dataset import dataset_helpers
+from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, ColumnName, FieldName
+from hafnia.dataset.primitives import get_primitive_type_from_string
+from hafnia.dataset.primitives.primitive import Primitive
+from hafnia.utils import remove_duplicates_preserve_order
 
-if TYPE_CHECKING:
-    from hafnia.dataset.hafnia_dataset import HafniaDataset
+if TYPE_CHECKING:  # Using 'TYPE_CHECKING' to avoid circular imports during type checking
+    from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
 
 
 ### Image transformations ###
@@ -65,7 +73,8 @@ def transform_images(
     path_image_folder = path_output / "data"
     path_image_folder.mkdir(parents=True, exist_ok=True)
 
-    for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
+    org_paths = dataset.samples[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Transform images"):
         org_path = Path(org_path)
         if not org_path.exists():
             raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
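The release swaps tqdm for rich throughout the package; the change is close to drop-in, with tqdm's `desc=` becoming rich's `description=`. A minimal sketch of the new pattern (file names hypothetical):

    from rich.progress import track

    for file_name in track(["a.jpg", "b.jpg"], description="Transform images"):
        ...  # process each file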
@@ -78,5 +87,321 @@ def transform_images(
             raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
         new_paths.append(str(new_path))
 
-    table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
-    return dataset.update_table(table)
+    table = dataset.samples.with_columns(pl.Series(new_paths).alias(ColumnName.FILE_PATH))
+    return dataset.update_samples(table)
+
+
+def get_task_info_from_task_name_and_primitive(
+    tasks: List["TaskInfo"],
+    task_name: Optional[str] = None,
+    primitive: Union[None, str, Type[Primitive]] = None,
+) -> "TaskInfo":
+    if len(tasks) == 0:
+        raise ValueError("Dataset has no tasks defined.")
+
+    tasks_str = "\n".join([f"\t{task.__repr__()}" for task in tasks])
+    if task_name is None and primitive is None:
+        if len(tasks) == 1:
+            return tasks[0]
+        else:
+            raise ValueError(
+                "For multiple tasks, you will need to specify 'task_name' or 'primitive' "
+                "to return a unique task. The dataset contains the following tasks: \n" + tasks_str
+            )
+
+    if isinstance(primitive, str):
+        primitive = get_primitive_type_from_string(primitive)
+
+    tasks_filtered = tasks
+    if primitive is None:
+        tasks_filtered = [task for task in tasks if task.name == task_name]
+
+        if len(tasks_filtered) == 0:
+            raise ValueError(f"No task found with {task_name=}. Available tasks: \n {tasks_str}")
+
+        unique_primitives = set(task.primitive for task in tasks_filtered)
+        if len(unique_primitives) > 1:
+            raise ValueError(
+                f"Found multiple tasks with {task_name=} using different primitives {unique_primitives=}. "
+                "Please specify the primitive type to make it unique. "
+                f"The dataset contains the following tasks: \n {tasks_str}"
+            )
+        primitive = list(unique_primitives)[0]
+
+    if task_name is None:
+        tasks_filtered = [task for task in tasks if task.primitive == primitive]
+        if len(tasks_filtered) == 0:
+            raise ValueError(f"No task found with {primitive=}. Available tasks: \n {tasks_str}")
+
+        unique_task_names = set(task.name for task in tasks_filtered)
+        if len(unique_task_names) > 1:
+            raise ValueError(
+                f"Found multiple tasks with {primitive=} using different task names {unique_task_names=}. "
+                "Please specify the 'task_name' to make it unique. "
+                f"The dataset contains the following tasks: \n {tasks_str}"
+            )
+        task_name = list(unique_task_names)[0]
+
+    tasks_filtered = [task for task in tasks_filtered if task.primitive == primitive and task.name == task_name]
+    if len(tasks_filtered) == 0:
+        raise ValueError(f"No task found with {task_name=} and {primitive=}. Available tasks: \n {tasks_str}")
+
+    if len(tasks_filtered) > 1:
+        raise ValueError(
+            f"Multiple tasks found with {task_name=} and {primitive=}. "
+            f"This should never happen. The dataset contains the following tasks: \n {tasks_str}"
+        )
+    task = tasks_filtered[0]
+    return task
+
+
+def class_mapper(
+    dataset: "HafniaDataset",
+    class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
+    method: str = "strict",
+    primitive: Optional[Type[Primitive]] = None,
+    task_name: Optional[str] = None,
+) -> "HafniaDataset":
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+    if isinstance(class_mapping, list):
+        class_mapping = dict(class_mapping)
+
+    allowed_methods = ("strict", "remove_undefined", "keep_undefined")
+    if method not in allowed_methods:
+        raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
+
+    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=primitive)
+    current_names = task.class_names or []
+
+    # Expand wildcard mappings e.g. {"Vehicle.*": "Vehicle"} to {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}
+    class_mapping = expand_class_mapping(class_mapping, current_names)
+
+    non_existing_mapping_names = set(class_mapping) - set(current_names)
+    if len(non_existing_mapping_names) > 0:
+        raise ValueError(
+            f"The specified class mapping contains class names {list(non_existing_mapping_names)} "
+            f"that do not exist in the dataset task '{task.name}'. "
+            f"Available class names: {current_names}"
+        )
+
+    missing_class_names = [c for c in current_names if c not in class_mapping]  # List comprehension preserves order
+    class_mapping = class_mapping.copy()
+    if method == "strict":
+        pass  # Continue to the strict-mapping check below
+    elif method == "remove_undefined":
+        for missing_class_name in missing_class_names:
+            class_mapping[missing_class_name] = OPS_REMOVE_CLASS
+    elif method == "keep_undefined":
+        for missing_class_name in missing_class_names:
+            class_mapping[missing_class_name] = missing_class_name
+    else:
+        raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
+
+    missing_class_names = [c for c in current_names if c not in class_mapping]
+    if len(missing_class_names) > 0:
+        error_msg = f"""\
+        The specified class mapping is not a strict mapping - meaning that not all class names
+        have been mapped to a new class name.
+        In the current mapping, the following classes {list(missing_class_names)} have not been mapped.
+        The currently specified mapping is:
+        {json.dumps(class_mapping, indent=2)}
+        A strict mapping maps all old class names (dictionary keys) to new class names (dictionary values).
+        Please update the mapping to include all class names from the dataset task '{task.name}'.
+        To keep a class, map it to the same name, e.g. 'person': 'person',
+        or remove a class by mapping it to '__REMOVE__', e.g. 'person': '__REMOVE__'."""
+        raise ValueError(textwrap.dedent(error_msg))
+
+    new_class_names = remove_duplicates_preserve_order(class_mapping.values())
+
+    if OPS_REMOVE_CLASS in new_class_names:
+        # Move __REMOVE__ to the end of the list if it exists
+        new_class_names.append(new_class_names.pop(new_class_names.index(OPS_REMOVE_CLASS)))
+
+    samples = dataset.samples
+    samples_updated = samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(FieldName.TASK_NAME) == task.name)
+                .then(pl.field(FieldName.CLASS_NAME).replace_strict(class_mapping))
+                .otherwise(pl.field(FieldName.CLASS_NAME))
+                .alias(FieldName.CLASS_NAME)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    # Update class indices too
+    name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}
+    samples_updated = samples_updated.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(FieldName.TASK_NAME) == task.name)
+                .then(pl.field(FieldName.CLASS_NAME).replace_strict(name_2_idx_mapping))
+                .otherwise(pl.field(FieldName.CLASS_IDX))
+                .alias(FieldName.CLASS_IDX)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    if OPS_REMOVE_CLASS in new_class_names:  # Remove class_names that are mapped to OPS_REMOVE_CLASS
+        samples_updated = samples_updated.with_columns(
+            pl.col(task.primitive.column_name())
+            .list.filter(pl.element().struct.field(FieldName.CLASS_NAME) != OPS_REMOVE_CLASS)
+            .alias(task.primitive.column_name())
+        )
+
+    new_class_names = [c for c in new_class_names if c != OPS_REMOVE_CLASS]
+
+    new_task = task.model_copy(deep=True)
+    new_task.class_names = new_class_names
+    dataset_info = dataset.info.replace_task(old_task=task, new_task=new_task)
+    return HafniaDataset(info=dataset_info, samples=samples_updated)
+
+
+def expand_class_mapping(wildcard_mapping: Dict[str, str], class_names: List[str]) -> Dict[str, str]:
+    """
+    Expand a wildcard class mapping to a full explicit mapping.
+
+    This function takes a mapping that may contain wildcard patterns (using '*')
+    and expands them to match actual class names from a dataset. Exact matches
+    take precedence over wildcard patterns.
+
+    Examples:
+        >>> from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS
+        >>> wildcard_mapping = {
+        ...     "Person": "Person",
+        ...     "Vehicle.*": "Vehicle",
+        ...     "Vehicle.Trailer": OPS_REMOVE_CLASS
+        ... }
+        >>> class_names = [
+        ...     "Person", "Vehicle.Car", "Vehicle.Trailer", "Vehicle.Bus", "Animal.Dog"
+        ... ]
+        >>> result = expand_class_mapping(wildcard_mapping, class_names)
+        >>> print(result)
+        {
+            "Person": "Person",
+            "Vehicle.Car": "Vehicle",
+            "Vehicle.Trailer": OPS_REMOVE_CLASS,  # Exact match overrides wildcard
+            "Vehicle.Bus": "Vehicle",
+            # Note: "Animal.Dog" is not included as it doesn't match any pattern
+        }
+    """
+    expanded_mapping = {}
+    for match_pattern, mapping_value in wildcard_mapping.items():
+        if "*" in match_pattern:
+            # Convert wildcard pattern to regex: escape special regex characters except *, then replace * with .*
+            regex_pattern = re.escape(match_pattern).replace("\\*", ".*")
+            class_names_matched = [cn for cn in class_names if re.fullmatch(regex_pattern, cn)]
+            expanded_mapping.update({cn: mapping_value for cn in class_names_matched})
+        else:
+            expanded_mapping.pop(match_pattern, None)
+            expanded_mapping[match_pattern] = mapping_value
+    return expanded_mapping
+
+
+def rename_task(
+    dataset: "HafniaDataset",
+    old_task_name: str,
+    new_task_name: str,
+) -> "HafniaDataset":
+    from hafnia.dataset.hafnia_dataset import HafniaDataset
+
+    old_task = dataset.info.get_task_by_name(task_name=old_task_name)
+    new_task = old_task.model_copy(deep=True)
+    new_task.name = new_task_name
+    samples = dataset.samples.with_columns(
+        pl.col(old_task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.field(FieldName.TASK_NAME).replace(old_task.name, new_task.name).alias(FieldName.TASK_NAME)
+            )
+        )
+        .alias(new_task.primitive.column_name())
+    )
+
+    dataset_info = dataset.info.replace_task(old_task=old_task, new_task=new_task)
+    return HafniaDataset(info=dataset_info, samples=samples)
+
+
+def select_samples_by_class_name(
+    dataset: "HafniaDataset",
+    name: Union[List[str], str],
+    task_name: Optional[str] = None,
+    primitive: Optional[Type[Primitive]] = None,
+) -> "HafniaDataset":
+    task, class_names = _validate_inputs_select_samples_by_class_name(
+        dataset=dataset,
+        name=name,
+        task_name=task_name,
+        primitive=primitive,
+    )
+
+    samples = dataset.samples.filter(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names)
+            & (pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        )
+        .list.any()
+    )
+
+    dataset_updated = dataset.update_samples(samples)
+    return dataset_updated
+
+
+def _validate_inputs_select_samples_by_class_name(
+    dataset: "HafniaDataset",
+    name: Union[List[str], str],
+    task_name: Optional[str] = None,
+    primitive: Optional[Type[Primitive]] = None,
+) -> Tuple["TaskInfo", List[str]]:
+    if isinstance(name, str):
+        name = [name]
+    names = list(name)
+
+    # Check that specified names are available in at least one of the tasks
+    available_names_across_tasks = set(more_itertools.flatten([t.class_names for t in dataset.info.tasks]))
+    missing_class_names_across_tasks = set(names) - available_names_across_tasks
+    if len(missing_class_names_across_tasks) > 0:
+        raise ValueError(
+            f"The specified names {list(names)} have not been found in any of the tasks. "
+            f"Available class names: {available_names_across_tasks}"
+        )
+
+    # Auto-infer task if task_name and primitive are not provided
+    if task_name is None and primitive is None:
+        tasks_with_names = [t for t in dataset.info.tasks if set(names).issubset(t.class_names or [])]
+        if len(tasks_with_names) == 0:
+            raise ValueError(
+                f"The specified names {names} have not been found in any of the tasks. "
+                f"Available class names: {available_names_across_tasks}"
+            )
+        if len(tasks_with_names) > 1:
+            raise ValueError(
+                f"Found multiple tasks containing the specified names {names}. "
+                f"Specify either 'task_name' or 'primitive' to only select from one task. "
+                f"Tasks containing all provided names: {[t.name for t in tasks_with_names]}"
+            )
+
+        task = tasks_with_names[0]
+
+    else:
+        task = get_task_info_from_task_name_and_primitive(
+            tasks=dataset.info.tasks,
+            task_name=task_name,
+            primitive=primitive,
+        )
+
+    task_class_names = set(task.class_names or [])
+    missing_class_names = set(names) - task_class_names
+    if len(missing_class_names) > 0:
+        raise ValueError(
+            f"The specified names {list(missing_class_names)} have not been found for the '{task.name}' task. "
+            f"Available class names: {task_class_names}"
+        )
+
+    return task, names
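Together these additions form a class-remapping workflow: expand wildcards, remap names and indices, drop removed classes, then filter samples. A hedged usage sketch (dataset and class names hypothetical; assumes the functions are called directly from the module, with OPS_REMOVE_CLASS == "__REMOVE__" as the error message above suggests):

    from hafnia.dataset.operations import dataset_transformations

    # Collapse fine-grained vehicle classes and drop trailers. "strict" requires
    # every current class to be mapped; wildcards are expanded via expand_class_mapping.
    dataset = dataset_transformations.class_mapper(
        dataset,
        class_mapping={"Person": "Person", "Vehicle.*": "Vehicle", "Vehicle.Trailer": "__REMOVE__"},
        method="strict",
    )

    # Keep only samples that contain at least one "Person" annotation.
    dataset = dataset_transformations.select_samples_by_class_name(dataset, name="Person")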
@@ -2,11 +2,12 @@ from pathlib import Path
 from typing import List, Optional, Type
 
 import polars as pl
-from tqdm import tqdm
+from rich.progress import track
 
 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
+    ColumnName,
     FieldName,
 )
 from hafnia.dataset.operations import table_transformations
@@ -34,8 +35,12 @@ def create_primitive_table(
 
     if keep_sample_data:
         # Drop other primitive columns to avoid conflicts
-        drop_columns = set(PRIMITIVE_TYPES) - {PrimitiveType, Classification}
-        remove_no_object_frames = remove_no_object_frames.drop(*[primitive.column_name() for primitive in drop_columns])
+
+        drop_columns_primitives = set(PRIMITIVE_TYPES) - {PrimitiveType, Classification}
+        drop_columns_names = [primitive.column_name() for primitive in drop_columns_primitives]
+        drop_columns_names = [c for c in drop_columns_names if c in remove_no_object_frames.columns]
+
+        remove_no_object_frames = remove_no_object_frames.drop(drop_columns_names)
         # Rename columns "height", "width" and "meta" for sample to avoid conflicts with object fields names
         remove_no_object_frames = remove_no_object_frames.rename(
             {"height": "image.height", "width": "image.width", "meta": "image.meta"}
@@ -46,6 +51,38 @@ def create_primitive_table(
     return objects_df
 
 
+def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
+    has_same_schema = samples0.schema == samples1.schema
+    if not has_same_schema:
+        shared_columns = []
+        for column_name, column_type in samples0.schema.items():
+            if column_name not in samples1.schema:
+                continue
+
+            if column_type != samples1.schema[column_name]:
+                continue
+            shared_columns.append(column_name)
+
+        dropped_columns0 = [
+            f"{n}[{ctype._string_repr()}]" for n, ctype in samples0.schema.items() if n not in shared_columns
+        ]
+        dropped_columns1 = [
+            f"{n}[{ctype._string_repr()}]" for n, ctype in samples1.schema.items() if n not in shared_columns
+        ]
+        user_logger.warning(
+            "Datasets with different schemas are being merged. "
+            "Only the columns with the same name and type will be kept in the merged dataset.\n"
+            f"Dropped columns in samples0: {dropped_columns0}\n"
+            f"Dropped columns in samples1: {dropped_columns1}\n"
+        )
+
+        samples0 = samples0.select(list(shared_columns))
+        samples1 = samples1.select(list(shared_columns))
+    merged_samples = pl.concat([samples0, samples1], how="vertical")
+    merged_samples = merged_samples.drop(ColumnName.SAMPLE_INDEX).with_row_index(name=ColumnName.SAMPLE_INDEX)
+    return merged_samples
+
+
 def filter_table_for_class_names(
     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
 ) -> Optional[pl.DataFrame]:
@@ -107,7 +144,7 @@ def split_primitive_columns_by_task_name(
     return samples_table
 
 
-def read_table_from_path(path: Path) -> pl.DataFrame:
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
        user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -125,7 +162,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:
 
 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-    for org_path in tqdm(table["file_name"].to_list(), desc="Check image paths"):
+    org_paths = table[ColumnName.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
        org_path = Path(org_path)
        if not org_path.exists():
            missing_files.append(org_path)
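The new `merge_samples` concatenates two sample tables, intersecting columns when schemas differ and rebuilding the sample index. A minimal sketch (column names hypothetical; assumes ColumnName.SAMPLE_INDEX == "sample_index"):

    import polars as pl

    from hafnia.dataset.operations.table_transformations import merge_samples

    samples0 = pl.DataFrame({"sample_index": [0, 1], "file_name": ["a.jpg", "b.jpg"]})
    samples1 = pl.DataFrame({"sample_index": [0], "file_name": ["c.jpg"]})
    merged = merge_samples(samples0, samples1)  # 3 rows, sample_index renumbered 0..2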
@@ -14,3 +14,11 @@ from .utils import class_color_by_name # noqa: F401
 PRIMITIVE_TYPES: List[Type[Primitive]] = [Bbox, Classification, Polygon, Bitmask]
 PRIMITIVE_NAME_TO_TYPE = {cls.__name__: cls for cls in PRIMITIVE_TYPES}
 PRIMITIVE_COLUMN_NAMES: List[str] = [PrimitiveType.column_name() for PrimitiveType in PRIMITIVE_TYPES]
+
+
+def get_primitive_type_from_string(name: str) -> Type[Primitive]:
+    if name not in PRIMITIVE_NAME_TO_TYPE:
+        raise ValueError(
+            f"Primitive '{name}' is not recognized. Available primitives: {list(PRIMITIVE_NAME_TO_TYPE.keys())}"
+        )
+    return PRIMITIVE_NAME_TO_TYPE[name]
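This lookup resolves a primitive class from its string name, which is what lets functions such as `class_mapper` accept `primitive` as either a string or a type. A short usage sketch:

    from hafnia.dataset.primitives import Bbox, get_primitive_type_from_string

    assert get_primitive_type_from_string("Bbox") is Bbox
    # get_primitive_type_from_string("Cuboid")  # would raise ValueError listing Bbox, Classification, Polygon, Bitmask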
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cv2
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,18 +18,30 @@ from hafnia.dataset.primitives.utils import (
 
 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float  # Height of the bounding box as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    width: float  # Width of the bounding box as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_x: float  # X coordinate of top-left corner of Bbox as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_y: float  # Y coordinate of top-left corner of Bbox as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bbox tasks in the same dataset. "" defaults to "bboxes"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bbox
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")
 
     @staticmethod
     def default_task_name() -> str:
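The rewritten field descriptions pin down the convention: Bbox stores fractions of image size, not pixels. A hedged sketch converting a pixel-space box (constructor usage as implied by the pydantic fields; values hypothetical):

    from hafnia.dataset.primitives.bbox import Bbox

    img_h, img_w = 720, 1280        # image size in pixels
    x, y, w, h = 640, 180, 128, 72  # pixel-space box: top-left x/y, width, height

    bbox = Bbox(
        top_left_x=x / img_w,  # 0.5
        top_left_y=y / img_h,  # 0.25
        width=w / img_w,       # 0.1
        height=h / img_h,      # 0.1
        class_name="car",
    )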
@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field
 
+from hafnia.dataset.dataset_names import FieldName
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
     anonymize_by_resizing,
@@ -14,23 +16,33 @@ from hafnia.dataset.primitives.utils import (
     text_org_from_left_bottom_to_centered,
 )
 
+FieldName
+
 
 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int  # Bitmask top coordinate in pixels
-    left: int  # Bitmask left coordinate in pixels
-    height: int  # Bitmask height of the bounding box in pixels
-    width: int  # Bitmask width of the bounding box in pixels
-    rleString: str  # Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left).
-    area: Optional[float] = None  # Area of the bitmask in pixels is calculated from the RLE string
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bitmask tasks in the same dataset. "" defaults to "bitmask"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    top: int = Field(description="Bitmask top coordinate in pixels")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels, calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")
 
     @staticmethod
     def default_task_name() -> str:
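Bitmask stores a crop-local RLE string plus the crop's pixel offset and size. A hedged sketch of producing those fields with pycocotools (mask content hypothetical; assumes rleString holds the COCO 'counts' string):

    import numpy as np
    import pycocotools.mask as coco_mask

    from hafnia.dataset.primitives.bitmask import Bitmask

    crop = np.zeros((32, 48), dtype=np.uint8)        # binary mask for the (height, width) crop
    crop[8:24, 12:36] = 1
    rle = coco_mask.encode(np.asfortranarray(crop))  # pycocotools expects Fortran order

    bitmask = Bitmask(
        top=100, left=200,    # crop offset in the full image, in pixels
        height=32, width=48,  # crop size in pixels
        rleString=rle["counts"].decode("utf-8"),
        area=float(coco_mask.area(rle)),
        class_name="person",
    )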
@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
+from pydantic import Field
 
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,14 +9,21 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_nam
 
 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: Optional[float] = Field(
+        default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
 
-    task_name: str = ""  # To support multiple Classification tasks in the same dataset. "" defaults to "classification"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the classification
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )
 
     @staticmethod
     def default_task_name() -> str:
@@ -38,7 +46,7 @@ class Classification(Primitive):
             text = class_name
         else:
             text = f"{self.task_name}: {class_name}"
-        image = image_visualizations.append_text_below_frame(image, text=text)
+        image = image_visualizations.append_text_below_frame(image, text=text, text_size_ratio=0.05)
 
         return image
 
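Classification keeps the same metadata surface as the other primitives; a minimal construction sketch (values hypothetical):

    from hafnia.dataset.primitives.classification import Classification

    prediction = Classification(class_name="cat", class_idx=0, confidence=0.93, ground_truth=False)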
@@ -1,13 +1,17 @@
 from typing import Any, Tuple
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from hafnia.dataset.primitives.utils import clip
 
 
 class Point(BaseModel):
-    x: float
-    y: float
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )
 
     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
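Point follows the same normalized convention, and `to_pixel_coordinates` maps back to pixel space. A hedged sketch (assuming image_shape is (height, width) and the result is an (x, y) pair):

    from hafnia.dataset.primitives.point import Point

    point = Point(x=0.5, y=0.25)
    # With image_shape=(720, 1280): x -> 0.5 * 1280 = 640, y -> 0.25 * 720 = 180
    pixel_point = point.to_pixel_coordinates(image_shape=(720, 1280))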