hafnia 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +130 -16
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +83 -22
  11. hafnia/dataset/format_conversions/format_image_classification_folder.py +110 -0
  12. hafnia/dataset/format_conversions/format_yolo.py +164 -0
  13. hafnia/dataset/format_conversions/torchvision_datasets.py +287 -0
  14. hafnia/dataset/hafnia_dataset.py +396 -96
  15. hafnia/dataset/operations/dataset_stats.py +84 -73
  16. hafnia/dataset/operations/dataset_transformations.py +116 -47
  17. hafnia/dataset/operations/table_transformations.py +135 -17
  18. hafnia/dataset/primitives/bbox.py +25 -14
  19. hafnia/dataset/primitives/bitmask.py +22 -15
  20. hafnia/dataset/primitives/classification.py +16 -8
  21. hafnia/dataset/primitives/point.py +7 -3
  22. hafnia/dataset/primitives/polygon.py +15 -10
  23. hafnia/dataset/primitives/primitive.py +1 -1
  24. hafnia/dataset/primitives/segmentation.py +12 -9
  25. hafnia/experiment/hafnia_logger.py +0 -9
  26. hafnia/platform/dataset_recipe.py +7 -2
  27. hafnia/platform/datasets.py +5 -9
  28. hafnia/platform/download.py +24 -90
  29. hafnia/torch_helpers.py +12 -12
  30. hafnia/utils.py +17 -0
  31. hafnia/visualizations/image_visualizations.py +3 -1
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/METADATA +11 -9
  33. hafnia-0.4.1.dist-info/RECORD +57 -0
  34. hafnia-0.3.0.dist-info/RECORD +0 -53
  35. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/WHEEL +0 -0
  36. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/entry_points.txt +0 -0
  37. {hafnia-0.3.0.dist-info → hafnia-0.4.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/operations/table_transformations.py

@@ -1,14 +1,14 @@
 from pathlib import Path
-from typing import List, Optional, Type
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type

 import polars as pl
-from tqdm import tqdm
+from rich.progress import track

 from hafnia.dataset.dataset_names import (
     FILENAME_ANNOTATIONS_JSONL,
     FILENAME_ANNOTATIONS_PARQUET,
-    ColumnName,
-    FieldName,
+    PrimitiveField,
+    SampleField,
 )
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import PRIMITIVE_TYPES
@@ -16,9 +16,15 @@ from hafnia.dataset.primitives.classification import Classification
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.log import user_logger

+if TYPE_CHECKING:
+    from hafnia.dataset.hafnia_dataset import TaskInfo
+

 def create_primitive_table(
-    samples_table: pl.DataFrame, PrimitiveType: Type[Primitive], keep_sample_data: bool = False
+    samples_table: pl.DataFrame,
+    PrimitiveType: Type[Primitive],
+    keep_sample_data: bool = False,
+    task_name: Optional[str] = None,
 ) -> Optional[pl.DataFrame]:
     """
     Returns a DataFrame with objects of the specified primitive type.
@@ -48,6 +54,9 @@ def create_primitive_table(
         objects_df = remove_no_object_frames.explode(column_name).unnest(column_name)
     else:
         objects_df = remove_no_object_frames.select(pl.col(column_name).explode().struct.unnest())
+
+    if task_name is not None:
+        objects_df = objects_df.filter(pl.col(PrimitiveField.TASK_NAME) == task_name)
     return objects_df

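The new `task_name` parameter narrows the exploded primitive rows to a single task. A minimal sketch of the filter's effect on a toy samples table (the data and literal field strings are illustrative; the package resolves them via `PrimitiveField`):

    import polars as pl

    # Toy samples table with a list[struct] primitive column, as produced by a HafniaDataset
    samples = pl.DataFrame(
        {
            "bboxes": [
                [
                    {"task_name": "task1", "class_name": "car"},
                    {"task_name": "task2", "class_name": "person"},
                ]
            ]
        }
    )

    # Mirrors create_primitive_table(..., task_name="task1") in 0.4.1:
    objects_df = samples.select(pl.col("bboxes").explode().struct.unnest())
    objects_df = objects_df.filter(pl.col("task_name") == "task1")
    print(objects_df)  # Only the "task1" objects remain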
@@ -55,11 +64,12 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
     has_same_schema = samples0.schema == samples1.schema
     if not has_same_schema:
         shared_columns = []
-        for column_name, column_type in samples0.schema.items():
+        for column_name, s0_column_type in samples0.schema.items():
             if column_name not in samples1.schema:
                 continue
+            samples0, samples1 = correction_of_list_struct_primitives(samples0, samples1, column_name)

-            if column_type != samples1.schema[column_name]:
+            if samples0.schema[column_name] != samples1.schema[column_name]:
                 continue
             shared_columns.append(column_name)

@@ -79,16 +89,58 @@ def merge_samples(samples0: pl.DataFrame, samples1: pl.DataFrame) -> pl.DataFrame:
     samples0 = samples0.select(list(shared_columns))
     samples1 = samples1.select(list(shared_columns))
     merged_samples = pl.concat([samples0, samples1], how="vertical")
-    merged_samples = merged_samples.drop(ColumnName.SAMPLE_INDEX).with_row_index(name=ColumnName.SAMPLE_INDEX)
+    merged_samples = add_sample_index(merged_samples)
     return merged_samples


+def correction_of_list_struct_primitives(
+    samples0: pl.DataFrame,
+    samples1: pl.DataFrame,
+    column_name: str,
+) -> Tuple[pl.DataFrame, pl.DataFrame]:
+    """
+    Corrects primitive columns (bboxes, polygons etc. of type 'list[struct]') by removing non-matching struct fields
+    between two datasets. This is useful when merging two datasets with the same primitive (e.g. Bbox), where
+    some (less important) field types in the struct differ between the two datasets.
+    This issue often occurs with the 'meta' field, as different dataset formats may store different metadata.
+    """
+    s0_column_type = samples0.schema[column_name]
+    s1_column_type = samples1.schema[column_name]
+    is_list_structs = s1_column_type == pl.List(pl.Struct) and s0_column_type == pl.List(pl.Struct)
+    is_non_matching_types = s1_column_type != s0_column_type
+    if is_list_structs and is_non_matching_types:  # Only perform correction for list[struct] types that do not match
+        s0_fields = set(s0_column_type.inner.fields)
+        s1_fields = set(s1_column_type.inner.fields)
+        similar_fields = s0_fields.intersection(s1_fields)
+        s0_dropped_fields = s0_fields - similar_fields
+        if len(s0_dropped_fields) > 0:
+            samples0 = samples0.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        s1_dropped_fields = s1_fields - similar_fields
+        if len(s1_dropped_fields) > 0:
+            samples1 = samples1.with_columns(
+                pl.col(column_name)
+                .list.eval(pl.struct([pl.element().struct.field(k.name) for k in similar_fields]))
+                .alias(column_name)
+            )
+        user_logger.warning(
+            f"Primitive column '{column_name}' has non-matching fields in the two datasets. "
+            f"Dropping fields in samples0: {[f.name for f in s0_dropped_fields]}. "
+            f"Dropping fields in samples1: {[f.name for f in s1_dropped_fields]}."
+        )
+
+    return samples0, samples1
+
+
 def filter_table_for_class_names(
     samples_table: pl.DataFrame, class_names: List[str], PrimitiveType: Type[Primitive]
 ) -> Optional[pl.DataFrame]:
     table_with_selected_class_names = samples_table.filter(
         pl.col(PrimitiveType.column_name())
-        .list.eval(pl.element().struct.field(FieldName.CLASS_NAME).is_in(class_names))
+        .list.eval(pl.element().struct.field(PrimitiveField.CLASS_NAME).is_in(class_names))
         .list.any()
     )

@@ -100,20 +152,20 @@ def split_primitive_columns_by_task_name(
     coordinate_types: Optional[List[Type[Primitive]]] = None,
 ) -> pl.DataFrame:
     """
-    Convert Primitive columns such as "objects" (Bbox) into a column for each task name.
-    For example, if the "objects" column (containing Bbox objects) has tasks "task1" and "task2".
+    Convert Primitive columns such as "bboxes" (Bbox) into a column for each task name.
+    For example, if the "bboxes" column (containing Bbox objects) has tasks "task1" and "task2".


     This:
     ─┬────────────┬─
-     ┆ objects    ┆
+     ┆ bboxes     ┆
      ┆ ---        ┆
      ┆ list[struc ┆
      ┆ t[11]]     ┆
     ═╪════════════╪═
     becomes this:
     ─┬────────────┬────────────┬─
-     ┆ objects.   ┆ objects.   ┆
+     ┆ bboxes.    ┆ bboxes.    ┆
      ┆ task1      ┆ task2      ┆
      ┆ ---        ┆ ---        ┆
      ┆ list[struc ┆ list[struc ┆
@@ -131,11 +183,11 @@ def split_primitive_columns_by_task_name(
         if samples_table[col_name].dtype != pl.List(pl.Struct):
             continue

-        task_names = samples_table[col_name].explode().struct.field(FieldName.TASK_NAME).unique().to_list()
+        task_names = samples_table[col_name].explode().struct.field(PrimitiveField.TASK_NAME).unique().to_list()
         samples_table = samples_table.with_columns(
             [
                 pl.col(col_name)
-                .list.filter(pl.element().struct.field(FieldName.TASK_NAME).eq(task_name))
+                .list.filter(pl.element().struct.field(PrimitiveField.TASK_NAME).eq(task_name))
                 .alias(f"{col_name}.{task_name}")
                 for task_name in task_names
             ]
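For reference, the per-task split now operates on the renamed "bboxes" column. A small sketch of the before/after columns (toy data, with literal field strings in place of `PrimitiveField`):

    import polars as pl

    samples = pl.DataFrame(
        {
            "bboxes": [
                [
                    {"task_name": "task1", "class_name": "car"},
                    {"task_name": "task2", "class_name": "person"},
                ]
            ]
        }
    )
    task_names = samples["bboxes"].explode().struct.field("task_name").unique().to_list()
    samples = samples.with_columns(
        [
            pl.col("bboxes")
            .list.filter(pl.element().struct.field("task_name").eq(task_name))
            .alias(f"bboxes.{task_name}")
            for task_name in task_names
        ]
    )
    print(samples.columns)  # ['bboxes', 'bboxes.task1', 'bboxes.task2'] (task order may vary)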
@@ -144,7 +196,7 @@ def split_primitive_columns_by_task_name(
     return samples_table


-def read_table_from_path(path: Path) -> pl.DataFrame:
+def read_samples_from_path(path: Path) -> pl.DataFrame:
     path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
     if path_annotations.exists():
         user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -162,7 +214,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:

 def check_image_paths(table: pl.DataFrame) -> bool:
     missing_files = []
-    for org_path in tqdm(table["file_name"].to_list(), desc="Check image paths"):
+    org_paths = table[SampleField.FILE_PATH].to_list()
+    for org_path in track(org_paths, description="Check image paths"):
         org_path = Path(org_path)
         if not org_path.exists():
             missing_files.append(org_path)
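Progress reporting switches from tqdm to Rich throughout this release; at call sites the main difference is the keyword name (`desc` becomes `description`). A minimal comparison:

    from rich.progress import track

    paths = ["a.jpg", "b.jpg", "c.jpg"]

    # 0.3.0: from tqdm import tqdm
    # for path in tqdm(paths, desc="Check image paths"): ...

    # 0.4.1:
    for path in track(paths, description="Check image paths"):
        ...  # e.g. Path(path).exists()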
@@ -218,3 +271,68 @@ def unnest_classification_tasks(table: pl.DataFrame, strict: bool = True) -> pl.DataFrame:

     table_out = table_out.with_columns([pl.col(c).list.first() for c in classification_columns])
     return table_out
+
+
+def update_class_indices(samples: pl.DataFrame, task: "TaskInfo") -> pl.DataFrame:
+    if task.class_names is None or len(task.class_names) == 0:
+        raise ValueError(f"Task '{task.name}' does not have defined class names to update class indices.")
+
+    objs = (
+        samples[task.primitive.column_name()]
+        .explode()
+        .struct.unnest()
+        .filter(pl.col(PrimitiveField.TASK_NAME) == task.name)
+    )
+    expected_class_names = set(objs[PrimitiveField.CLASS_NAME].unique())
+    missing_class_names = expected_class_names - set(task.class_names)
+    if len(missing_class_names) > 0:
+        raise ValueError(
+            f"Task '{task.name}' is missing class names: {missing_class_names}. Cannot update class indices."
+        )
+
+    name_2_idx_mapping = {name: idx for idx, name in enumerate(task.class_names)}
+
+    samples_updated = samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.eval(
+            pl.element().struct.with_fields(
+                pl.when(pl.field(PrimitiveField.TASK_NAME) == task.name)
+                .then(pl.field(PrimitiveField.CLASS_NAME).replace_strict(name_2_idx_mapping, default=-1))
+                .otherwise(pl.field(PrimitiveField.CLASS_IDX))
+                .alias(PrimitiveField.CLASS_IDX)
+            )
+        )
+        .alias(task.primitive.column_name())
+    )
+
+    return samples_updated
+
+
+def add_sample_index(samples: pl.DataFrame) -> pl.DataFrame:
+    """
+    Adds a sample index column to the samples DataFrame.
+
+    Note: Unlike the built-in 'polars.DataFrame.with_row_count', this function
+    always guarantees 'pl.UInt64' type for the index column.
+    """
+    if SampleField.SAMPLE_INDEX in samples.columns:
+        samples = samples.drop(SampleField.SAMPLE_INDEX)
+    samples = samples.select(
+        pl.int_range(0, pl.count(), dtype=pl.UInt64).alias(SampleField.SAMPLE_INDEX),
+        pl.all(),
+    )
+    return samples
+
+
+def add_dataset_name_if_missing(table: pl.DataFrame, dataset_name: str) -> pl.DataFrame:
+    if SampleField.DATASET_NAME not in table.columns:
+        table = table.with_columns(pl.lit(dataset_name).alias(SampleField.DATASET_NAME))
+    else:
+        table = table.with_columns(
+            pl.when(pl.col(SampleField.DATASET_NAME).is_null())
+            .then(pl.lit(dataset_name))
+            .otherwise(pl.col(SampleField.DATASET_NAME))
+            .alias(SampleField.DATASET_NAME)
+        )
+
+    return table
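Both new table helpers are safe to call repeatedly. A usage sketch on a toy table, assuming the `SampleField` constants resolve to the literal strings "sample_index" and "dataset_name" (the enum values are not shown in this diff; the helper itself uses `pl.count()`, an older alias of `pl.len()`):

    import polars as pl

    table = pl.DataFrame({"file_name": ["a.jpg", "b.jpg"], "dataset_name": [None, "coco"]})

    # add_sample_index: always (re)creates a UInt64 index as the first column
    table = table.select(pl.int_range(0, pl.len(), dtype=pl.UInt64).alias("sample_index"), pl.all())

    # add_dataset_name_if_missing: fills only rows where the name is null
    table = table.with_columns(
        pl.when(pl.col("dataset_name").is_null())
        .then(pl.lit("my_dataset"))
        .otherwise(pl.col("dataset_name"))
        .alias("dataset_name")
    )
    print(table)  # Rows: (0, "a.jpg", "my_dataset"), (1, "b.jpg", "coco")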
hafnia/dataset/primitives/bbox.py

@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,26 +18,36 @@ from hafnia.dataset.primitives.utils import (

 class Bbox(Primitive):
     # Names should match names in FieldName
-    height: float  # Height of the bounding box as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    width: float  # Width of the bounding box as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_x: float  # X coordinate of top-left corner of Bbox as a fraction of the image width, e.g. 0.1 for 10% of the image width
-    top_left_y: float  # Y coordinate of top-left corner of Bbox as a fraction of the image height, e.g. 0.1 for 10% of the image height
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bbox tasks in the same dataset. "" defaults to "bboxes"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    height: float = Field(
+        description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+    )
+    width: float = Field(
+        description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+    )
+    top_left_x: float = Field(
+        description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+    )
+    top_left_y: float = Field(
+        description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "bboxes"
+        return "object_detection"

     @staticmethod
     def column_name() -> str:
-        return "objects"
+        return "bboxes"

     def calculate_area(self) -> float:
         return self.height * self.width
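Two renames here are breaking for Bbox consumers: the storage column changes from "objects" to "bboxes" and the default task name from "bboxes" to "object_detection"; `confidence` also now defaults to 1.0 instead of None. A quick sketch (coordinate values are illustrative):

    from hafnia.dataset.primitives.bbox import Bbox

    bbox = Bbox(
        top_left_x=0.10,  # normalized: 10% from the left edge
        top_left_y=0.20,  # normalized: 20% from the top edge
        width=0.30,
        height=0.15,
        class_name="car",
    )
    assert bbox.confidence == 1.0  # new default in 0.4.1 (was Optional[float] = None)
    assert Bbox.column_name() == "bboxes"  # was "objects" in 0.3.0
    assert Bbox.default_task_name() == "object_detection"  # was "bboxes"
    print(bbox.calculate_area())  # 0.045 (= height * width, in normalized units)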
hafnia/dataset/primitives/bitmask.py

@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
 import pycocotools.mask as coco_mask
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import (
@@ -17,24 +18,30 @@ from hafnia.dataset.primitives.utils import (

 class Bitmask(Primitive):
     # Names should match names in FieldName
-    top: int  # Bitmask top coordinate in pixels
-    left: int  # Bitmask left coordinate in pixels
-    height: int  # Bitmask height of the bounding box in pixels
-    width: int  # Bitmask width of the bounding box in pixels
-    rleString: str  # Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left).
-    area: Optional[float] = None  # Area of the bitmask in pixels is calculated from the RLE string
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Bitmask tasks in the same dataset. "" defaults to "bitmask"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    top: int = Field(description="Bitmask top coordinate in pixels")
+    left: int = Field(description="Bitmask left coordinate in pixels")
+    height: int = Field(description="Bitmask height of the bounding box in pixels")
+    width: int = Field(description="Bitmask width of the bounding box in pixels")
+    rleString: str = Field(
+        description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+    )
+    area: Optional[float] = Field(
+        default=None, description="Area of the bitmask in pixels, calculated from the RLE string"
+    )
+    class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

     @staticmethod
     def default_task_name() -> str:
-        return "bitmask"
+        return "mask_detection"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/classification.py

@@ -1,6 +1,7 @@
 from typing import Any, Dict, Optional, Tuple

 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,18 +9,25 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name

 class Classification(Primitive):
     # Names should match names in FieldName
-    class_name: Optional[str] = None  # Class name, e.g. "car"
-    class_idx: Optional[int] = None  # Class index, e.g. 0 for "car" if it is the first class
-    object_id: Optional[str] = None  # Unique identifier for the object, e.g. "12345123"
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+    class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+    object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+    confidence: float = Field(
+        default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+    )
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    task_name: str = ""  # To support multiple Classification tasks in the same dataset. "" defaults to "classification"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    task_name: str = Field(
+        default="",
+        description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the classification"
+    )

     @staticmethod
     def default_task_name() -> str:
-        return "classification"
+        return "image_classification"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/point.py

@@ -1,13 +1,17 @@
 from typing import Any, Tuple

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from hafnia.dataset.primitives.utils import clip


 class Point(BaseModel):
-    x: float
-    y: float
+    x: float = Field(
+        description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+    )
+    y: float = Field(
+        description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+    )

     def to_pixel_coordinates(
         self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
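With the new Field descriptions, Point documents its normalized-coordinate contract directly in the schema. A hedged usage sketch; the diff shows only the `to_pixel_coordinates` signature, so the (height, width) interpretation of `image_shape` and the returned tuple order are assumptions to verify against the package:

    from hafnia.dataset.primitives.point import Point

    point = Point(x=0.5, y=0.25)  # image center horizontally, a quarter of the way down
    # Assumption: image_shape is (height, width)
    px = point.to_pixel_coordinates(image_shape=(480, 640), as_int=True, clip_values=True)
    print(px)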
hafnia/dataset/primitives/polygon.py

@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.bitmask import Bitmask
 from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,19 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name

 class Polygon(Primitive):
     # Names should match names in FieldName
-    points: List[Point]
-    class_name: Optional[str] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    class_idx: Optional[int] = None  # This should match the string in 'FieldName.CLASS_IDX'
-    object_id: Optional[str] = None  # This should match the string in 'FieldName.OBJECT_ID'
-    confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
-
-    task_name: str = ""  # Task name to support multiple Polygon tasks in the same dataset. "" defaults to "polygon"
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask
+    points: List[Point] = Field(description="List of points defining the polygon")
+    class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+    class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+    object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+    confidence: float = Field(default=1.0, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+    task_name: str = Field(
+        default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the polygon"
+    )

     @staticmethod
     def from_list_of_points(
@@ -33,7 +38,7 @@ class Polygon(Primitive):

     @staticmethod
     def default_task_name() -> str:
-        return "polygon"
+        return "polygon_detection"

     @staticmethod
     def column_name() -> str:
hafnia/dataset/primitives/primitive.py

@@ -22,7 +22,7 @@ class Primitive(BaseModel, metaclass=ABCMeta):
     def column_name() -> str:
         """
         Name of field used in hugging face datasets for storing annotations
-        E.g. "objects" for Bbox.
+        E.g. "bboxes" for Bbox.
         """
         pass

hafnia/dataset/primitives/segmentation.py

@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple

 import cv2
 import numpy as np
+from pydantic import Field

 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.dataset.primitives.utils import get_class_name
@@ -9,23 +10,25 @@ from hafnia.visualizations.colors import get_n_colors


 class Segmentation(Primitive):
-    # mask: np.ndarray
-    class_names: Optional[List[str]] = None  # This should match the string in 'FieldName.CLASS_NAME'
-    ground_truth: bool = True  # Whether this is ground truth or a prediction
+    # WARNING: Segmentation masks have not been fully implemented yet
+    class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+    ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

-    # confidence: Optional[float] = None  # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
-    task_name: str = (
-        ""  # Task name to support multiple Segmentation tasks in the same dataset. "" defaults to "segmentation"
+    task_name: str = Field(
+        default="",
+        description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+    )
+    meta: Optional[Dict[str, Any]] = Field(
+        default=None, description="This can be used to store additional information about the segmentation"
     )
-    meta: Optional[Dict[str, Any]] = None  # This can be used to store additional information about the bitmask

     @staticmethod
     def default_task_name() -> str:
-        return "segmentation"
+        return "semantic_segmentation"

     @staticmethod
     def column_name() -> str:
-        return "segmentation"
+        return "segmentations"

     def calculate_area(self) -> float:
         raise NotImplementedError()
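Taken together, 0.4.1 renames every primitive's default task name, plus two column names. A verification sketch using only values visible in the hunks above (import paths follow the file list):

    from hafnia.dataset.primitives.bbox import Bbox
    from hafnia.dataset.primitives.bitmask import Bitmask
    from hafnia.dataset.primitives.classification import Classification
    from hafnia.dataset.primitives.polygon import Polygon
    from hafnia.dataset.primitives.segmentation import Segmentation

    # 0.3.0 -> 0.4.1 default task names
    assert Bbox.default_task_name() == "object_detection"  # was "bboxes"
    assert Bitmask.default_task_name() == "mask_detection"  # was "bitmask"
    assert Classification.default_task_name() == "image_classification"  # was "classification"
    assert Polygon.default_task_name() == "polygon_detection"  # was "polygon"
    assert Segmentation.default_task_name() == "semantic_segmentation"  # was "segmentation"

    # Column renames
    assert Bbox.column_name() == "bboxes"  # was "objects"
    assert Segmentation.column_name() == "segmentations"  # was "segmentation"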
hafnia/experiment/hafnia_logger.py

@@ -12,8 +12,6 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from pydantic import BaseModel, field_validator

-from hafnia.data.factory import load_dataset
-from hafnia.dataset.hafnia_dataset import HafniaDataset
 from hafnia.log import sys_logger, user_logger
 from hafnia.utils import is_hafnia_cloud_job, now_as_str

@@ -136,13 +134,6 @@ class HafniaLogger:
         except Exception as e:
             user_logger.error(f"Failed to initialize MLflow: {e}")

-    def load_dataset(self, dataset_name: str) -> HafniaDataset:
-        """
-        Load a dataset from the specified path.
-        """
-        self.dataset_name = dataset_name
-        return load_dataset(dataset_name)
-
     def path_local_experiment(self) -> Path:
         """Get the path for local experiment."""
         if is_hafnia_cloud_job():
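Callers of the removed `HafniaLogger.load_dataset` need a migration path. A hedged sketch, assuming the factory function that 0.3.0 imported (`hafnia.data.factory.load_dataset`) remains available in 0.4.1 (only the logger's import of it is removed in this diff):

    from hafnia.data.factory import load_dataset

    # 0.3.0:
    # dataset = logger.load_dataset("my-dataset")

    # 0.4.1: load directly; the logger no longer stores dataset_name as a side effect
    dataset = load_dataset("my-dataset")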
hafnia/platform/dataset_recipe.py

@@ -11,12 +11,17 @@ from hafnia.utils import pretty_print_list_as_table, timed

 @timed("Get or create dataset recipe")
 def get_or_create_dataset_recipe(
-    recipe: dict, endpoint: str, api_key: str, name: Optional[str] = None
+    recipe: dict,
+    endpoint: str,
+    api_key: str,
+    name: Optional[str] = None,
+    overwrite: bool = False,
 ) -> Optional[Dict]:
     headers = {"Authorization": api_key}
-    data = {"template": {"body": recipe}}
+    data = {"template": {"body": recipe}, "overwrite": overwrite}
     if name is not None:
         data["name"] = name  # type: ignore[assignment]
+
     response = http.post(endpoint, headers=headers, data=data)
     return response

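The new `overwrite` flag is simply forwarded in the request body, letting callers replace an existing recipe instead of reusing it. A usage sketch (recipe body, endpoint, and key are placeholders):

    from hafnia.platform.dataset_recipe import get_or_create_dataset_recipe

    recipe = {"kind": "dataset", "steps": []}  # hypothetical recipe body
    response = get_or_create_dataset_recipe(
        recipe=recipe,
        endpoint="https://api.example.com/dataset-recipes",  # placeholder
        api_key="YOUR_API_KEY",  # placeholder
        name="my-recipe",
        overwrite=True,  # new in 0.4.1
    )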
hafnia/platform/download.py

@@ -9,11 +9,11 @@ from typing import Any, Dict, List, Optional

 import rich
 from rich import print as rprint
-from tqdm import tqdm
+from rich.progress import track

 from cli.config import Config
 from hafnia import http, utils
-from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED, ColumnName
+from hafnia.dataset.dataset_names import DATASET_FILENAMES_REQUIRED
 from hafnia.dataset.dataset_recipe.dataset_recipe import (
     DatasetRecipe,
     get_dataset_path_from_recipe,
@@ -120,15 +120,11 @@ def download_dataset_from_access_endpoint(
         return
     dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
     try:
-        fast_copy_files_s3(
-            src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
-            dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
-            append_envs=envs,
-            description="Downloading images",
-        )
+        dataset = dataset.download_files_aws(path_dataset, aws_credentials=resource_credentials, force_redownload=True)
     except ValueError as e:
         user_logger.error(f"Failed to download images: {e}")
         return
+    dataset.write_annotations(path_folder=path_dataset)  # Overwrite annotations as files have been re-downloaded


 def fast_copy_files_s3(
@@ -196,7 +192,7 @@ def execute_s5cmd_commands(

     error_lines = []
     lines = []
-    for line in tqdm(process.stdout, total=len(commands), desc=description):
+    for line in track(process.stdout, total=len(commands), description=description):
         if "ERROR" in line or "error" in line:
             error_lines.append(line.strip())
         lines.append(line.strip())
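The CLI download path no longer builds s5cmd copy lists from the REMOTE_PATH/FILE_NAME columns; it delegates to the dataset object and then rewrites the annotations. A condensed sketch of the new flow using only the calls visible in the hunk above (`path_dataset` and `resource_credentials` come from the surrounding function):

    from hafnia.dataset.hafnia_dataset import HafniaDataset

    dataset = HafniaDataset.from_path(path_dataset, check_for_images=False)
    dataset = dataset.download_files_aws(
        path_dataset,
        aws_credentials=resource_credentials,
        force_redownload=True,
    )
    # Annotations are rewritten because file locations changed during the download
    dataset.write_annotations(path_folder=path_dataset)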