hafnia 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. cli/__main__.py +3 -1
  2. cli/config.py +43 -3
  3. cli/keychain.py +88 -0
  4. cli/profile_cmds.py +5 -2
  5. hafnia/__init__.py +1 -1
  6. hafnia/dataset/dataset_helpers.py +9 -2
  7. hafnia/dataset/dataset_names.py +2 -1
  8. hafnia/dataset/dataset_recipe/dataset_recipe.py +49 -37
  9. hafnia/dataset/dataset_recipe/recipe_transforms.py +18 -2
  10. hafnia/dataset/dataset_upload_helper.py +60 -4
  11. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  12. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  13. hafnia/dataset/hafnia_dataset.py +176 -50
  14. hafnia/dataset/operations/dataset_stats.py +2 -3
  15. hafnia/dataset/operations/dataset_transformations.py +19 -15
  16. hafnia/dataset/operations/table_transformations.py +4 -3
  17. hafnia/dataset/primitives/bbox.py +25 -12
  18. hafnia/dataset/primitives/bitmask.py +26 -14
  19. hafnia/dataset/primitives/classification.py +16 -8
  20. hafnia/dataset/primitives/point.py +7 -3
  21. hafnia/dataset/primitives/polygon.py +16 -9
  22. hafnia/dataset/primitives/segmentation.py +10 -7
  23. hafnia/experiment/hafnia_logger.py +0 -9
  24. hafnia/platform/dataset_recipe.py +7 -2
  25. hafnia/platform/datasets.py +3 -3
  26. hafnia/platform/download.py +23 -18
  27. hafnia/utils.py +17 -0
  28. hafnia/visualizations/image_visualizations.py +1 -1
  29. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/METADATA +8 -6
  30. hafnia-0.4.0.dist-info/RECORD +56 -0
  31. hafnia-0.3.0.dist-info/RECORD +0 -53
  32. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  33. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  34. {hafnia-0.3.0.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -33,17 +33,17 @@ import json
  import re
  import textwrap
  from pathlib import Path
- from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union

  import cv2
  import more_itertools
  import numpy as np
  import polars as pl
  from PIL import Image
- from tqdm import tqdm
+ from rich.progress import track

  from hafnia.dataset import dataset_helpers
- from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, FieldName
+ from hafnia.dataset.dataset_names import OPS_REMOVE_CLASS, ColumnName, FieldName
  from hafnia.dataset.primitives import get_primitive_type_from_string
  from hafnia.dataset.primitives.primitive import Primitive
  from hafnia.utils import remove_duplicates_preserve_order
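
Across this release, tqdm progress bars are replaced with rich.progress. As a rough, self-contained sketch (not taken from the diff), rich's track() wraps an iterable much like tqdm(), with the desc keyword renamed to description:

    from rich.progress import track

    # track() wraps any iterable and renders a progress bar while iterating.
    for file_name in track(["img_0.jpg", "img_1.jpg"], description="Transform images"):
        pass  # process each file here
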
@@ -73,7 +73,8 @@ def transform_images(
  path_image_folder = path_output / "data"
  path_image_folder.mkdir(parents=True, exist_ok=True)

- for org_path in tqdm(dataset.samples["file_name"].to_list(), desc="Transform images"):
+ org_paths = dataset.samples[ColumnName.FILE_PATH].to_list()
+ for org_path in track(org_paths, description="Transform images"):
  org_path = Path(org_path)
  if not org_path.exists():
  raise FileNotFoundError(f"File {org_path} does not exist in the dataset.")
@@ -86,7 +87,7 @@ def transform_images(
  raise FileNotFoundError(f"Transformed file {new_path} does not exist in the dataset.")
  new_paths.append(str(new_path))

- table = dataset.samples.with_columns(pl.Series(new_paths).alias("file_name"))
+ table = dataset.samples.with_columns(pl.Series(new_paths).alias(ColumnName.FILE_PATH))
  return dataset.update_samples(table)

@@ -156,13 +157,16 @@ def get_task_info_from_task_name_and_primitive(

  def class_mapper(
  dataset: "HafniaDataset",
- class_mapping: Dict[str, str],
+ class_mapping: Union[Dict[str, str], List[Tuple[str, str]]],
  method: str = "strict",
  primitive: Optional[Type[Primitive]] = None,
  task_name: Optional[str] = None,
  ) -> "HafniaDataset":
  from hafnia.dataset.hafnia_dataset import HafniaDataset

+ if isinstance(class_mapping, list):
+ class_mapping = dict(class_mapping)
+
  allowed_methods = ("strict", "remove_undefined", "keep_undefined")
  if method not in allowed_methods:
  raise ValueError(f"Method '{method}' is not recognized. Allowed methods are: {allowed_methods}")
@@ -170,7 +174,7 @@ def class_mapper(
  task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=primitive)
  current_names = task.class_names or []

- # Expand wildcard mappings
+ # Expand wildcard mappings e.g. {"Vehicle.*": "Vehicle"} to {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}
  class_mapping = expand_class_mapping(class_mapping, current_names)

  non_existing_mapping_names = set(class_mapping) - set(current_names)
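
A minimal usage sketch of the updated class_mapper follows; the import path and the dataset construction are assumptions, while the signature, the accepted list-of-tuples mapping, and the wildcard expansion are taken from the diff:

    from hafnia.dataset.hafnia_dataset import HafniaDataset
    from hafnia.dataset.operations.dataset_transformations import class_mapper  # assumed module path

    def remap_vehicle_classes(dataset: HafniaDataset) -> HafniaDataset:
        # Wildcard keys are expanded against the task's class names, e.g.
        # {"Vehicle.*": "Vehicle"} becomes {"Vehicle.Car": "Vehicle", "Vehicle.Bus": "Vehicle"}.
        return class_mapper(
            dataset,
            class_mapping=[("Vehicle.*", "Vehicle"), ("Person", "Pedestrian")],  # list of tuples is now accepted
            method="remove_undefined",  # one of: "strict", "remove_undefined", "keep_undefined"
        )
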
@@ -213,7 +217,6 @@ def class_mapper(
  if OPS_REMOVE_CLASS in new_class_names:
  # Move __REMOVE__ to the end of the list if it exists
  new_class_names.append(new_class_names.pop(new_class_names.index(OPS_REMOVE_CLASS)))
- name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}

  samples = dataset.samples
  samples_updated = samples.with_columns(
@@ -230,6 +233,7 @@ def class_mapper(
  )

  # Update class indices too
+ name_2_idx_mapping: Dict[str, int] = {name: idx for idx, name in enumerate(new_class_names)}
  samples_updated = samples_updated.with_columns(
  pl.col(task.primitive.column_name())
  .list.eval(
@@ -354,14 +358,14 @@ def _validate_inputs_select_samples_by_class_name(
  name: Union[List[str], str],
  task_name: Optional[str] = None,
  primitive: Optional[Type[Primitive]] = None,
- ) -> Tuple["TaskInfo", Set[str]]:
+ ) -> Tuple["TaskInfo", List[str]]:
  if isinstance(name, str):
  name = [name]
- names = set(name)
+ names = list(name)

  # Check that specified names are available in at least one of the tasks
  available_names_across_tasks = set(more_itertools.flatten([t.class_names for t in dataset.info.tasks]))
- missing_class_names_across_tasks = names - available_names_across_tasks
+ missing_class_names_across_tasks = set(names) - available_names_across_tasks
  if len(missing_class_names_across_tasks) > 0:
  raise ValueError(
  f"The specified names {list(names)} have not been found in any of the tasks. "
@@ -370,15 +374,15 @@ def _validate_inputs_select_samples_by_class_name(

  # Auto infer task if task_name and primitive are not provided
  if task_name is None and primitive is None:
- tasks_with_names = [t for t in dataset.info.tasks if names.issubset(t.class_names or [])]
+ tasks_with_names = [t for t in dataset.info.tasks if set(names).issubset(t.class_names or [])]
  if len(tasks_with_names) == 0:
  raise ValueError(
- f"The specified names {list(names)} have not been found in any of the tasks. "
+ f"The specified names {names} have not been found in any of the tasks. "
  f"Available class names: {available_names_across_tasks}"
  )
  if len(tasks_with_names) > 1:
  raise ValueError(
- f"Found multiple tasks containing the specified names {list(names)}. "
+ f"Found multiple tasks containing the specified names {names}. "
  f"Specify either 'task_name' or 'primitive' to only select from one task. "
  f"Tasks containing all provided names: {[t.name for t in tasks_with_names]}"
  )
@@ -393,7 +397,7 @@ def _validate_inputs_select_samples_by_class_name(
  )

  task_class_names = set(task.class_names or [])
- missing_class_names = names - task_class_names
+ missing_class_names = set(names) - task_class_names
  if len(missing_class_names) > 0:
  raise ValueError(
  f"The specified names {list(missing_class_names)} have not been found for the '{task.name}' task. "
@@ -2,7 +2,7 @@ from pathlib import Path
  from typing import List, Optional, Type

  import polars as pl
- from tqdm import tqdm
+ from rich.progress import track

  from hafnia.dataset.dataset_names import (
  FILENAME_ANNOTATIONS_JSONL,
@@ -144,7 +144,7 @@ def split_primitive_columns_by_task_name(
  return samples_table


- def read_table_from_path(path: Path) -> pl.DataFrame:
+ def read_samples_from_path(path: Path) -> pl.DataFrame:
  path_annotations = path / FILENAME_ANNOTATIONS_PARQUET
  if path_annotations.exists():
  user_logger.info(f"Reading dataset annotations from Parquet file: {path_annotations}")
@@ -162,7 +162,8 @@ def read_table_from_path(path: Path) -> pl.DataFrame:

  def check_image_paths(table: pl.DataFrame) -> bool:
  missing_files = []
- for org_path in tqdm(table["file_name"].to_list(), desc="Check image paths"):
+ org_paths = table[ColumnName.FILE_PATH].to_list()
+ for org_path in track(org_paths, description="Check image paths"):
  org_path = Path(org_path)
  if not org_path.exists():
  missing_files.append(org_path)
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

  import cv2
  import numpy as np
+ from pydantic import Field

  from hafnia.dataset.primitives.primitive import Primitive
  from hafnia.dataset.primitives.utils import (
@@ -17,18 +18,30 @@ from hafnia.dataset.primitives.utils import (

  class Bbox(Primitive):
  # Names should match names in FieldName
- height: float # Height of the bounding box as a fraction of the image height, e.g. 0.1 for 10% of the image height
- width: float # Width of the bounding box as a fraction of the image width, e.g. 0.1 for 10% of the image width
- top_left_x: float # X coordinate of top-left corner of Bbox as a fraction of the image width, e.g. 0.1 for 10% of the image width
- top_left_y: float # Y coordinate of top-left corner of Bbox as a fraction of the image height, e.g. 0.1 for 10% of the image height
- class_name: Optional[str] = None # Class name, e.g. "car"
- class_idx: Optional[int] = None # Class index, e.g. 0 for "car" if it is the first class
- object_id: Optional[str] = None # Unique identifier for the object, e.g. "12345123"
- confidence: Optional[float] = None # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
- ground_truth: bool = True # Whether this is ground truth or a prediction
-
- task_name: str = "" # Task name to support multiple Bbox tasks in the same dataset. "" defaults to "bboxes"
- meta: Optional[Dict[str, Any]] = None # This can be used to store additional information about the bitmask
+ height: float = Field(
+ description="Normalized height of the bounding box (0.0=no height, 1.0=full image height) as a fraction of image height"
+ )
+ width: float = Field(
+ description="Normalized width of the bounding box (0.0=no width, 1.0=full image width) as a fraction of image width"
+ )
+ top_left_x: float = Field(
+ description="Normalized x-coordinate of top-left corner (0.0=left edge, 1.0=right edge) as a fraction of image width"
+ )
+ top_left_y: float = Field(
+ description="Normalized y-coordinate of top-left corner (0.0=top edge, 1.0=bottom edge) as a fraction of image height"
+ )
+ class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+ class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+ object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+ confidence: Optional[float] = Field(
+ default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+ )
+ ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+ task_name: str = Field(
+ default="", description="Task name to support multiple Bbox tasks in the same dataset. '' defaults to 'bboxes'"
+ )
+ meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

  @staticmethod
  def default_task_name() -> str:
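
For orientation, a small construction example consistent with the new Field descriptions; the values are illustrative, and all coordinates and sizes are normalized fractions of the image dimensions:

    from hafnia.dataset.primitives.bbox import Bbox

    bbox = Bbox(
        top_left_x=0.25,  # 25% in from the left edge
        top_left_y=0.10,  # 10% down from the top edge
        width=0.30,       # 30% of the image width
        height=0.20,      # 20% of the image height
        class_name="car",
        class_idx=0,
    )
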
@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional, Tuple
  import cv2
  import numpy as np
  import pycocotools.mask as coco_mask
+ from pydantic import Field

+ from hafnia.dataset.dataset_names import FieldName
  from hafnia.dataset.primitives.primitive import Primitive
  from hafnia.dataset.primitives.utils import (
  anonymize_by_resizing,
@@ -14,23 +16,33 @@ from hafnia.dataset.primitives.utils import (
  text_org_from_left_bottom_to_centered,
  )

+ FieldName
+

  class Bitmask(Primitive):
  # Names should match names in FieldName
- top: int # Bitmask top coordinate in pixels
- left: int # Bitmask left coordinate in pixels
- height: int # Bitmask height of the bounding box in pixels
- width: int # Bitmask width of the bounding box in pixels
- rleString: str # Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left).
- area: Optional[float] = None # Area of the bitmask in pixels is calculated from the RLE string
- class_name: Optional[str] = None # This should match the string in 'FieldName.CLASS_NAME'
- class_idx: Optional[int] = None # This should match the string in 'FieldName.CLASS_IDX'
- object_id: Optional[str] = None # This should match the string in 'FieldName.OBJECT_ID'
- confidence: Optional[float] = None # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
- ground_truth: bool = True # Whether this is ground truth or a prediction
-
- task_name: str = "" # Task name to support multiple Bitmask tasks in the same dataset. "" defaults to "bitmask"
- meta: Optional[Dict[str, Any]] = None # This can be used to store additional information about the bitmask
+ top: int = Field(description="Bitmask top coordinate in pixels ")
+ left: int = Field(description="Bitmask left coordinate in pixels")
+ height: int = Field(description="Bitmask height of the bounding box in pixels")
+ width: int = Field(description="Bitmask width of the bounding box in pixels")
+ rleString: str = Field(
+ description="Run-length encoding (RLE) string for the bitmask region of size (height, width) at (top, left)."
+ )
+ area: Optional[float] = Field(
+ default=None, description="Area of the bitmask in pixels is calculated from the RLE string"
+ )
+ class_name: Optional[str] = Field(default=None, description="Class name of the object represented by the bitmask")
+ class_idx: Optional[int] = Field(default=None, description="Class index of the object represented by the bitmask")
+ object_id: Optional[str] = Field(default=None, description="Object ID of the instance represented by the bitmask")
+ confidence: Optional[float] = Field(
+ default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+ )
+ ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+ task_name: str = Field(
+ default="", description="Task name to support multiple Bitmask tasks in the same dataset. Defaults to 'bitmask'"
+ )
+ meta: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata for the annotation")

  @staticmethod
  def default_task_name() -> str:
@@ -1,6 +1,7 @@
  from typing import Any, Dict, Optional, Tuple

  import numpy as np
+ from pydantic import Field

  from hafnia.dataset.primitives.primitive import Primitive
  from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_name
@@ -8,14 +9,21 @@ from hafnia.dataset.primitives.utils import anonymize_by_resizing, get_class_nam

  class Classification(Primitive):
  # Names should match names in FieldName
- class_name: Optional[str] = None # Class name, e.g. "car"
- class_idx: Optional[int] = None # Class index, e.g. 0 for "car" if it is the first class
- object_id: Optional[str] = None # Unique identifier for the object, e.g. "12345123"
- confidence: Optional[float] = None # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
- ground_truth: bool = True # Whether this is ground truth or a prediction
-
- task_name: str = "" # To support multiple Classification tasks in the same dataset. "" defaults to "classification"
- meta: Optional[Dict[str, Any]] = None # This can be used to store additional information about the bitmask
+ class_name: Optional[str] = Field(default=None, description="Class name, e.g. 'car'")
+ class_idx: Optional[int] = Field(default=None, description="Class index, e.g. 0 for 'car' if it is the first class")
+ object_id: Optional[str] = Field(default=None, description="Unique identifier for the object, e.g. '12345123'")
+ confidence: Optional[float] = Field(
+ default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification"
+ )
+ ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+ task_name: str = Field(
+ default="",
+ description="To support multiple Classification tasks in the same dataset. '' defaults to 'classification'",
+ )
+ meta: Optional[Dict[str, Any]] = Field(
+ default=None, description="This can be used to store additional information about the classification"
+ )

  @staticmethod
  def default_task_name() -> str:
@@ -1,13 +1,17 @@
  from typing import Any, Tuple

- from pydantic import BaseModel
+ from pydantic import BaseModel, Field

  from hafnia.dataset.primitives.utils import clip


  class Point(BaseModel):
- x: float
- y: float
+ x: float = Field(
+ description="Normalized x-coordinate (0.0=left edge, 1.0=right edge) relative to image width",
+ )
+ y: float = Field(
+ description="Normalized y-coordinate (0.0=top edge, 1.0=bottom edge) relative to image height",
+ )

  def to_pixel_coordinates(
  self, image_shape: Tuple[int, int], as_int: bool = True, clip_values: bool = True
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple

  import cv2
  import numpy as np
+ from pydantic import Field

  from hafnia.dataset.primitives.bitmask import Bitmask
  from hafnia.dataset.primitives.point import Point
@@ -11,15 +12,21 @@ from hafnia.dataset.primitives.utils import class_color_by_name, get_class_name

  class Polygon(Primitive):
  # Names should match names in FieldName
- points: List[Point]
- class_name: Optional[str] = None # This should match the string in 'FieldName.CLASS_NAME'
- class_idx: Optional[int] = None # This should match the string in 'FieldName.CLASS_IDX'
- object_id: Optional[str] = None # This should match the string in 'FieldName.OBJECT_ID'
- confidence: Optional[float] = None # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox
- ground_truth: bool = True # Whether this is ground truth or a prediction
-
- task_name: str = "" # Task name to support multiple Polygon tasks in the same dataset. "" defaults to "polygon"
- meta: Optional[Dict[str, Any]] = None # This can be used to store additional information about the bitmask
+ points: List[Point] = Field(description="List of points defining the polygon")
+ class_name: Optional[str] = Field(default=None, description="Class name of the polygon")
+ class_idx: Optional[int] = Field(default=None, description="Class index of the polygon")
+ object_id: Optional[str] = Field(default=None, description="Object ID of the polygon")
+ confidence: Optional[float] = Field(
+ default=None, description="Confidence score (0-1.0) for the primitive, e.g. 0.95 for Bbox"
+ )
+ ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")
+
+ task_name: str = Field(
+ default="", description="Task name to support multiple Polygon tasks in the same dataset. Defaults to 'polygon'"
+ )
+ meta: Optional[Dict[str, Any]] = Field(
+ default=None, description="This can be used to store additional information about the polygon"
+ )

  @staticmethod
  def from_list_of_points(
@@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple

  import cv2
  import numpy as np
+ from pydantic import Field

  from hafnia.dataset.primitives.primitive import Primitive
  from hafnia.dataset.primitives.utils import get_class_name
@@ -9,15 +10,17 @@ from hafnia.visualizations.colors import get_n_colors


  class Segmentation(Primitive):
- # mask: np.ndarray
- class_names: Optional[List[str]] = None # This should match the string in 'FieldName.CLASS_NAME'
- ground_truth: bool = True # Whether this is ground truth or a prediction
+ # WARNING: Segmentation masks have not been fully implemented yet
+ class_names: Optional[List[str]] = Field(default=None, description="Class names of the segmentation")
+ ground_truth: bool = Field(default=True, description="Whether this is ground truth or a prediction")

- # confidence: Optional[float] = None # Confidence score (0-1.0) for the primitive, e.g. 0.95 for Classification
- task_name: str = (
- "" # Task name to support multiple Segmentation tasks in the same dataset. "" defaults to "segmentation"
+ task_name: str = Field(
+ default="",
+ description="Task name to support multiple Segmentation tasks in the same dataset. Defaults to 'segmentation'",
+ )
+ meta: Optional[Dict[str, Any]] = Field(
+ default=None, description="This can be used to store additional information about the segmentation"
  )
- meta: Optional[Dict[str, Any]] = None # This can be used to store additional information about the bitmask

  @staticmethod
  def default_task_name() -> str:
@@ -12,8 +12,6 @@ import pyarrow as pa
  import pyarrow.parquet as pq
  from pydantic import BaseModel, field_validator

- from hafnia.data.factory import load_dataset
- from hafnia.dataset.hafnia_dataset import HafniaDataset
  from hafnia.log import sys_logger, user_logger
  from hafnia.utils import is_hafnia_cloud_job, now_as_str

@@ -136,13 +134,6 @@ class HafniaLogger:
  except Exception as e:
  user_logger.error(f"Failed to initialize MLflow: {e}")

- def load_dataset(self, dataset_name: str) -> HafniaDataset:
- """
- Load a dataset from the specified path.
- """
- self.dataset_name = dataset_name
- return load_dataset(dataset_name)
-
  def path_local_experiment(self) -> Path:
  """Get the path for local experiment."""
  if is_hafnia_cloud_job():
@@ -11,12 +11,17 @@ from hafnia.utils import pretty_print_list_as_table, timed

  @timed("Get or create dataset recipe")
  def get_or_create_dataset_recipe(
- recipe: dict, endpoint: str, api_key: str, name: Optional[str] = None
+ recipe: dict,
+ endpoint: str,
+ api_key: str,
+ name: Optional[str] = None,
+ overwrite: bool = False,
  ) -> Optional[Dict]:
  headers = {"Authorization": api_key}
- data = {"template": {"body": recipe}}
+ data = {"template": {"body": recipe}, "overwrite": overwrite}
  if name is not None:
  data["name"] = name # type: ignore[assignment]
+
  response = http.post(endpoint, headers=headers, data=data)
  return response
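
With the new overwrite flag, the request body assembled by get_or_create_dataset_recipe looks roughly like this (the recipe content and name are placeholders):

    data = {
        "template": {"body": {"...": "..."}},  # the recipe dict supplied by the caller
        "overwrite": True,                     # new flag; defaults to False
        "name": "my-dataset-recipe",           # only included when a name is provided
    }
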
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional

  import rich
  from rich import print as rprint
- from tqdm import tqdm
+ from rich.progress import track

  from cli.config import Config
  from hafnia import http, utils
@@ -122,7 +122,7 @@ def download_dataset_from_access_endpoint(
  try:
  fast_copy_files_s3(
  src_paths=dataset.samples[ColumnName.REMOTE_PATH].to_list(),
- dst_paths=dataset.samples[ColumnName.FILE_NAME].to_list(),
+ dst_paths=dataset.samples[ColumnName.FILE_PATH].to_list(),
  append_envs=envs,
  description="Downloading images",
  )
@@ -196,7 +196,7 @@ def execute_s5cmd_commands(

  error_lines = []
  lines = []
- for line in tqdm(process.stdout, total=len(commands), desc=description):
+ for line in track(process.stdout, total=len(commands), description=description):
  if "ERROR" in line or "error" in line:
  error_lines.append(line.strip())
  lines.append(line.strip())
@@ -1,10 +1,10 @@
  from pathlib import Path
- from typing import Dict
+ from typing import Dict, Optional

  import boto3
  from botocore.exceptions import ClientError
  from pydantic import BaseModel, field_validator
- from tqdm import tqdm
+ from rich.progress import Progress

  from hafnia.http import fetch
  from hafnia.log import sys_logger, user_logger
@@ -125,13 +125,15 @@ def download_single_object(s3_client, bucket: str, object_key: str, output_dir:
  return local_path


- def download_resource(resource_url: str, destination: str, api_key: str) -> Dict:
+ def download_resource(resource_url: str, destination: str, api_key: str, prefix: Optional[str] = None) -> Dict:
  """
  Downloads either a single file from S3 or all objects under a prefix.

  Args:
  resource_url (str): The URL or identifier used to fetch S3 credentials.
  destination (str): Path to local directory where files will be stored.
+ api_key (str): API key for authentication when fetching credentials.
+ prefix (Optional[str]): If provided, only download objects under this prefix.

  Returns:
  Dict[str, Any]: A dictionary containing download info, e.g.:
@@ -147,7 +149,7 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
  res_credentials = get_resource_credentials(resource_url, api_key)

  bucket_name = res_credentials.bucket_name()
- key = res_credentials.object_key()
+ prefix = prefix or res_credentials.object_key()

  output_path = Path(destination)
  output_path.mkdir(parents=True, exist_ok=True)
@@ -159,29 +161,32 @@ def download_resource(resource_url: str, destination: str, api_key: str) -> Dict
  )
  downloaded_files = []
  try:
- s3_client.head_object(Bucket=bucket_name, Key=key)
- local_file = download_single_object(s3_client, bucket_name, key, output_path)
+ s3_client.head_object(Bucket=bucket_name, Key=prefix)
+ local_file = download_single_object(s3_client, bucket_name, prefix, output_path)
  downloaded_files.append(str(local_file))
  user_logger.info(f"Downloaded single file: {local_file}")

  except ClientError as e:
  error_code = e.response.get("Error", {}).get("Code")
  if error_code == "404":
- sys_logger.debug(f"Object '{key}' not found; trying as a prefix.")
- response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=key)
+ sys_logger.debug(f"Object '{prefix}' not found; trying as a prefix.")
+ response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
  contents = response.get("Contents", [])

  if not contents:
- raise ValueError(f"No objects found for prefix '{key}' in bucket '{bucket_name}'")
- pbar = tqdm(contents)
- for obj in pbar:
- sub_key = obj["Key"]
- size_mb = obj.get("Size", 0) / 1024 / 1024
- pbar.set_description(f"{sub_key} ({size_mb:.2f} MB)")
- local_file = download_single_object(s3_client, bucket_name, sub_key, output_path)
- downloaded_files.append(local_file.as_posix())
-
- user_logger.info(f"Downloaded folder/prefix '{key}' with {len(downloaded_files)} object(s).")
+ raise ValueError(f"No objects found for prefix '{prefix}' in bucket '{bucket_name}'")
+
+ with Progress() as progress:
+ task = progress.add_task("Downloading files", total=len(contents))
+ for obj in contents:
+ sub_key = obj["Key"]
+ size_mb = obj.get("Size", 0) / 1024 / 1024
+ progress.update(task, description=f"Downloading {sub_key} ({size_mb:.2f} MB)")
+ local_file = download_single_object(s3_client, bucket_name, sub_key, output_path)
+ downloaded_files.append(local_file.as_posix())
+ progress.advance(task)
+
+ user_logger.info(f"Downloaded folder/prefix '{prefix}' with {len(downloaded_files)} object(s).")
  else:
  user_logger.error(f"Error checking object or prefix: {e}")
  raise RuntimeError(f"Failed to check or download S3 resource: {e}") from e
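
A hedged call sketch for the extended download_resource; the URL, API key, and prefix are placeholders, and only the signature and the prefix fallback behaviour come from the diff:

    from hafnia.platform.download import download_resource

    info = download_resource(
        resource_url="https://example.invalid/resources/1234",  # placeholder
        destination="./downloads",
        api_key="YOUR_API_KEY",  # placeholder
        prefix="datasets/sample/",  # optional; falls back to the credential's object key when omitted
    )
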
hafnia/utils.py CHANGED
@@ -63,6 +63,18 @@ def timed(label: str):
  return decorator


+ def get_path_hafnia_cache() -> Path:
+ return Path.home() / "hafnia"
+
+
+ def get_path_torchvision_downloads() -> Path:
+ return get_path_hafnia_cache() / "torchvision_downloads"
+
+
+ def get_path_hafnia_conversions() -> Path:
+ return get_path_hafnia_cache() / "hafnia_conversions"
+
+
  def now_as_str() -> str:
  """Get the current date and time as a string."""
  return datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
@@ -205,3 +217,8 @@ def remove_duplicates_preserve_order(seq: Iterable) -> List:
  Remove duplicates from a list while preserving the order of elements.
  """
  return list(more_itertools.unique_everseen(seq))
+
+
+ def is_image_file(file_path: Path) -> bool:
+ image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif")
+ return file_path.suffix.lower() in image_extensions
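
The new helpers in hafnia/utils.py resolve to fixed locations under the user's home directory and perform a simple extension check, for example:

    from pathlib import Path

    from hafnia.utils import get_path_hafnia_cache, get_path_torchvision_downloads, is_image_file

    print(get_path_hafnia_cache())           # <home>/hafnia
    print(get_path_torchvision_downloads())  # <home>/hafnia/torchvision_downloads
    print(is_image_file(Path("photo.JPG")))  # True (the extension check is case-insensitive)
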
@@ -193,7 +193,7 @@ def save_dataset_sample_set_visualizations(
  image = draw_annotations(image, annotations, draw_settings=draw_settings)

  pil_image = Image.fromarray(image)
- path_image = path_output_folder / Path(sample.file_name).name
+ path_image = path_output_folder / Path(sample.file_path).name
  pil_image.save(path_image)
  paths.append(path_image)