hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. cli/__main__.py +16 -3
  2. cli/config.py +45 -4
  3. cli/consts.py +1 -1
  4. cli/dataset_cmds.py +6 -14
  5. cli/dataset_recipe_cmds.py +78 -0
  6. cli/experiment_cmds.py +226 -43
  7. cli/keychain.py +88 -0
  8. cli/profile_cmds.py +10 -6
  9. cli/runc_cmds.py +5 -5
  10. cli/trainer_package_cmds.py +65 -0
  11. hafnia/__init__.py +2 -0
  12. hafnia/data/factory.py +1 -2
  13. hafnia/dataset/dataset_helpers.py +9 -14
  14. hafnia/dataset/dataset_names.py +10 -5
  15. hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
  16. hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
  17. hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
  18. hafnia/dataset/dataset_upload_helper.py +265 -56
  19. hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
  20. hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
  21. hafnia/dataset/hafnia_dataset.py +577 -213
  22. hafnia/dataset/license_types.py +63 -0
  23. hafnia/dataset/operations/dataset_stats.py +259 -3
  24. hafnia/dataset/operations/dataset_transformations.py +332 -7
  25. hafnia/dataset/operations/table_transformations.py +43 -5
  26. hafnia/dataset/primitives/__init__.py +8 -0
  27. hafnia/dataset/primitives/bbox.py +25 -12
  28. hafnia/dataset/primitives/bitmask.py +26 -14
  29. hafnia/dataset/primitives/classification.py +16 -8
  30. hafnia/dataset/primitives/point.py +7 -3
  31. hafnia/dataset/primitives/polygon.py +16 -9
  32. hafnia/dataset/primitives/segmentation.py +10 -7
  33. hafnia/experiment/hafnia_logger.py +111 -8
  34. hafnia/http.py +16 -2
  35. hafnia/platform/__init__.py +9 -3
  36. hafnia/platform/builder.py +12 -10
  37. hafnia/platform/dataset_recipe.py +104 -0
  38. hafnia/platform/datasets.py +47 -9
  39. hafnia/platform/download.py +25 -19
  40. hafnia/platform/experiment.py +51 -56
  41. hafnia/platform/trainer_package.py +57 -0
  42. hafnia/utils.py +81 -13
  43. hafnia/visualizations/image_visualizations.py +4 -4
  44. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
  45. hafnia-0.4.0.dist-info/RECORD +56 -0
  46. cli/recipe_cmds.py +0 -45
  47. hafnia-0.2.4.dist-info/RECORD +0 -49
  48. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
  49. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
  50. {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,19 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
3
4
  from datetime import datetime
4
5
  from enum import Enum
5
6
  from pathlib import Path
6
- from typing import Dict, List, Optional, Tuple, Type, Union
7
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
7
8
 
8
9
  import boto3
9
10
  import polars as pl
10
- from pydantic import BaseModel, ConfigDict
11
+ from PIL import Image
12
+ from pydantic import BaseModel, ConfigDict, field_validator
11
13
 
12
- import hafnia.dataset.primitives.bbox
13
- import hafnia.dataset.primitives.bitmask
14
- import hafnia.dataset.primitives.classification
15
- import hafnia.dataset.primitives.polygon
16
- import hafnia.dataset.primitives.segmentation
17
14
  from cli.config import Config
18
15
  from hafnia.dataset import primitives
19
16
  from hafnia.dataset.dataset_names import (
@@ -23,11 +20,19 @@ from hafnia.dataset.dataset_names import (
23
20
  FieldName,
24
21
  SplitName,
25
22
  )
26
- from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
23
+ from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
24
+ from hafnia.dataset.operations import table_transformations
25
+ from hafnia.dataset.primitives import (
26
+ Bbox,
27
+ Bitmask,
28
+ Classification,
29
+ Polygon,
30
+ Segmentation,
31
+ )
27
32
  from hafnia.dataset.primitives.primitive import Primitive
28
33
  from hafnia.http import post
29
34
  from hafnia.log import user_logger
30
- from hafnia.platform import get_dataset_id
35
+ from hafnia.platform.datasets import get_dataset_id
31
36
 
32
37
 
33
38
  def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
@@ -47,13 +52,14 @@ class DbDataset(BaseModel, validate_assignment=True): # type: ignore[call-arg]
47
52
  license_citation: Optional[str] = None
48
53
  version: Optional[str] = None
49
54
  s3_bucket_name: Optional[str] = None
55
+ dataset_format_version: Optional[str] = None
50
56
  annotation_date: Optional[datetime] = None
51
57
  annotation_project_id: Optional[str] = None
52
58
  annotation_dataset_id: Optional[str] = None
53
59
  annotation_ontology: Optional[str] = None
54
60
  dataset_variants: Optional[List[DbDatasetVariant]] = None
55
61
  split_annotations_reports: Optional[List[DbSplitAnnotationsReport]] = None
56
- dataset_images: Optional[List[DatasetImage]] = None
62
+ imgs: Optional[List[DatasetImage]] = None
57
63
 
58
64
 
59
65
  class DbDatasetVariant(BaseModel, validate_assignment=True): # type: ignore[call-arg]
@@ -75,6 +81,8 @@ class DbAnnotatedObject(BaseModel, validate_assignment=True): # type: ignore[ca
75
81
  model_config = ConfigDict(use_enum_values=True) # To parse Enum values as strings
76
82
  name: str
77
83
  entity_type: EntityTypeChoices
84
+ annotation_type: DbAnnotationType
85
+ task_name: Optional[str] = None # Not sure if adding task_name makes sense.
78
86
 
79
87
 
80
88
  class DbAnnotatedObjectReport(BaseModel, validate_assignment=True): # type: ignore[call-arg]
@@ -82,10 +90,34 @@ class DbAnnotatedObjectReport(BaseModel, validate_assignment=True): # type: ign
82
90
  obj: DbAnnotatedObject
83
91
  unique_obj_ids: Optional[int] = None
84
92
  obj_instances: Optional[int] = None
93
+ images_with_obj: Optional[int] = None
94
+
85
95
  average_count_per_image: Optional[float] = None
86
- avg_area: Optional[float] = None
87
- min_area: Optional[float] = None
88
- max_area: Optional[float] = None
96
+
97
+ area_avg_ratio: Optional[float] = None
98
+ area_min_ratio: Optional[float] = None
99
+ area_max_ratio: Optional[float] = None
100
+
101
+ height_avg_ratio: Optional[float] = None
102
+ height_min_ratio: Optional[float] = None
103
+ height_max_ratio: Optional[float] = None
104
+
105
+ width_avg_ratio: Optional[float] = None
106
+ width_min_ratio: Optional[float] = None
107
+ width_max_ratio: Optional[float] = None
108
+
109
+ area_avg_px: Optional[float] = None
110
+ area_min_px: Optional[int] = None
111
+ area_max_px: Optional[int] = None
112
+
113
+ height_avg_px: Optional[float] = None
114
+ height_min_px: Optional[int] = None
115
+ height_max_px: Optional[int] = None
116
+
117
+ width_avg_px: Optional[float] = None
118
+ width_min_px: Optional[int] = None
119
+ width_max_px: Optional[int] = None
120
+
89
121
  annotation_type: Optional[List[DbAnnotationType]] = None
90
122
 
91
123
 
@@ -155,8 +187,78 @@ class EntityTypeChoices(str, Enum): # Should match `EntityTypeChoices` in `dipd
155
187
  EVENT = "EVENT"
156
188
 
157
189
 
158
- class DatasetImage(BaseModel, validate_assignment=True): # type: ignore[call-arg]
159
- img: str
190
class Annotations(BaseModel):
    """
    Container for the per-image annotations attached to a gallery image.

    Referenced by 'DatasetImageMetadata' so that annotations can be shown
    for gallery images on the dataset detail page. Every field is optional
    and defaults to None.
    """

    # Bounding boxes
    objects: Optional[List[Bbox]] = None
    # Image-level classifications
    classifications: Optional[List[Classification]] = None
    # Polygon annotations
    polygons: Optional[List[Polygon]] = None
    # Bitmask annotations
    bitmasks: Optional[List[Bitmask]] = None
200
+
201
+
202
class DatasetImageMetadata(BaseModel):
    """
    Annotations and basic sample information for one gallery image shown on
    the dataset detail page on portal.
    """

    annotations: Optional[Annotations] = None
    meta: Optional[Dict[str, Any]] = None

    @classmethod
    def from_sample(cls, sample: Sample) -> "DatasetImageMetadata":
        """
        Build the gallery metadata from a dataset sample.

        The sample is deep-copied first, so the caller's object is not modified.
        The copied file path is shortened to its last three path components
        before it is stored in 'meta'.
        """
        sample_copy = sample.model_copy(deep=True)
        sample_copy.file_path = "/".join(Path(sample_copy.file_path).parts[-3:])

        # Only fields that exist on the sample and are not None end up in 'meta'
        wanted_fields = (
            ColumnName.FILE_PATH,
            ColumnName.HEIGHT,
            ColumnName.WIDTH,
            ColumnName.SPLIT,
        )
        field_values = ((name, getattr(sample_copy, name, None)) for name in wanted_fields)
        meta = {name: value for name, value in field_values if value is not None}

        annotations = Annotations(
            objects=sample_copy.objects,
            classifications=sample_copy.classifications,
            polygons=sample_copy.polygons,
            bitmasks=sample_copy.bitmasks,
        )
        return DatasetImageMetadata(annotations=annotations, meta=meta)
236
+
237
+
238
class DatasetImage(Attribution, validate_assignment=True):  # type: ignore[call-arg]
    """
    A gallery image for the dataset detail page.

    Extends 'Attribution' with the image itself (as a base64-encoded data URL),
    an optional display order and optional metadata.
    """

    img: str  # Base64-encoded image string
    order: Optional[int] = None
    metadata: Optional[DatasetImageMetadata] = None

    @field_validator("img", mode="before")
    @classmethod
    def validate_image_path(cls, v: Union[str, Path]) -> str:
        """
        Normalize the 'img' input to a base64 data URL.

        A 'Path' is read from disk and converted with 'path_image_to_base64_str'.
        Any other non-string value, or a string that does not start with
        'data:image/', is rejected with a ValueError.
        """
        if isinstance(v, Path):
            v = path_image_to_base64_str(path_image=v)

        if not isinstance(v, str):
            raise ValueError("Image must be a string or Path object representing the image path.")

        if not v.startswith("data:image/"):
            raise ValueError("Image must be a base64-encoded data URL.")

        return v
255
+
256
+
257
def path_image_to_base64_str(path_image: Path) -> str:
    """
    Encode an image file as a base64 data URL ('data:<mime>;base64,<payload>').

    The image is opened with PIL only to detect its format; the payload is the
    unmodified file content read from disk.

    Raises:
        KeyError: If PIL has no MIME type registered for the detected format.
    """
    # Use a context manager so the file handle opened by PIL is released right away
    with Image.open(path_image) as image:
        image_format = image.format
    mime_format = Image.MIME[image_format]
    as_b64 = base64.b64encode(path_image.read_bytes()).decode("ascii")
    return f"data:{mime_format};base64,{as_b64}"
160
262
 
161
263
 
162
264
  class DbDistributionType(BaseModel, validate_assignment=True): # type: ignore[call-arg]
@@ -185,7 +287,10 @@ def get_folder_size(path: Path) -> int:
185
287
  return sum([path.stat().st_size for path in path.rglob("*")])
186
288
 
187
289
 
188
- def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset) -> dict:
290
+ def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
291
+ if not upload_gallery_images:
292
+ dataset_update.imgs = None
293
+
189
294
  cfg = Config()
190
295
  dataset_details = dataset_update.model_dump_json()
191
296
  data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
@@ -199,9 +304,9 @@ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
199
304
  import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
200
305
  headers = {"Authorization": cfg.api_key}
201
306
 
202
- user_logger.info("Importing dataset details. This may take up to 30 seconds...")
203
- data = post(endpoint=import_endpoint, headers=headers, data=data) # type: ignore[assignment]
204
- return data # type: ignore[return-value]
307
+ user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
308
+ response = post(endpoint=import_endpoint, headers=headers, data=data) # type: ignore[assignment]
309
+ return response # type: ignore[return-value]
205
310
 
206
311
 
207
312
  def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
@@ -219,7 +324,6 @@ def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Ty
219
324
  col_name = PrimitiveType.column_name()
220
325
  table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
221
326
  if col_name not in table.columns:
222
- user_logger.warning(f"Warning: No field called '{col_name}' was found for '{PrimitiveType.__name__}'.")
223
327
  return False
224
328
 
225
329
  if table[col_name].dtype == pl.Null:
@@ -235,7 +339,7 @@ def calculate_distribution_values(
235
339
 
236
340
  if len(distribution_tasks) == 0:
237
341
  return []
238
- classification_column = hafnia.dataset.primitives.classification.Classification.column_name()
342
+ classification_column = Classification.column_name()
239
343
  classifications = dataset_split.select(pl.col(classification_column).explode())
240
344
  classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
241
345
  classifications = classifications.filter(
@@ -277,6 +381,8 @@ def dataset_info_from_dataset(
277
381
  deployment_stage: DeploymentStage,
278
382
  path_sample: Optional[Path],
279
383
  path_hidden: Optional[Path],
384
+ path_gallery_images: Optional[Path] = None,
385
+ gallery_image_names: Optional[List[str]] = None,
280
386
  ) -> DbDataset:
281
387
  dataset_variants = []
282
388
  dataset_reports = []
@@ -292,6 +398,12 @@ def dataset_info_from_dataset(
292
398
  if len(path_and_variant) == 0:
293
399
  raise ValueError("At least one path must be provided for sample or hidden dataset.")
294
400
 
401
+ gallery_images = create_gallery_images(
402
+ dataset=dataset,
403
+ path_gallery_images=path_gallery_images,
404
+ gallery_image_names=gallery_image_names,
405
+ )
406
+
295
407
  for path_dataset, variant_type in path_and_variant:
296
408
  if variant_type == DatasetVariant.SAMPLE:
297
409
  dataset_variant = dataset.create_sample_dataset()
@@ -331,19 +443,26 @@ def dataset_info_from_dataset(
331
443
  )
332
444
 
333
445
  object_reports: List[DbAnnotatedObjectReport] = []
334
- primitive_columns = [tPrimtive.column_name() for tPrimtive in primitives.PRIMITIVE_TYPES]
335
- if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bbox.Bbox):
336
- bbox_column_name = hafnia.dataset.primitives.bbox.Bbox.column_name()
337
- drop_columns = [col for col in primitive_columns if col != bbox_column_name]
338
- drop_columns.append(FieldName.META)
339
- df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
340
- df_per_instance = df_per_instance.explode(bbox_column_name).drop(drop_columns).unnest(bbox_column_name)
341
-
446
+ primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
447
+ if has_primitive(dataset_split, PrimitiveType=Bbox):
448
+ df_per_instance = table_transformations.create_primitive_table(
449
+ dataset_split, PrimitiveType=Bbox, keep_sample_data=True
450
+ )
451
+ if df_per_instance is None:
452
+ raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
342
453
  # Calculate area of bounding boxes
343
- df_per_instance = df_per_instance.with_columns((pl.col("height") * pl.col("width")).alias("area"))
454
+ df_per_instance = df_per_instance.with_columns(
455
+ (pl.col("height") * pl.col("width")).alias("area"),
456
+ ).with_columns(
457
+ (pl.col("height") * pl.col("image.height")).alias("height_px"),
458
+ (pl.col("width") * pl.col("image.width")).alias("width_px"),
459
+ (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
460
+ )
344
461
 
345
462
  annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
346
- for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
463
+ for (class_name, task_name), class_group in df_per_instance.group_by(
464
+ FieldName.CLASS_NAME, FieldName.TASK_NAME
465
+ ):
347
466
  if class_name is None:
348
467
  continue
349
468
  object_reports.append(
@@ -351,25 +470,39 @@ def dataset_info_from_dataset(
351
470
  obj=DbAnnotatedObject(
352
471
  name=class_name,
353
472
  entity_type=EntityTypeChoices.OBJECT.value,
473
+ annotation_type=annotation_type,
474
+ task_name=task_name,
354
475
  ),
355
476
  unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
356
477
  obj_instances=len(class_group),
357
478
  annotation_type=[annotation_type],
358
- avg_area=class_group["area"].mean(),
359
- min_area=class_group["area"].min(),
360
- max_area=class_group["area"].max(),
479
+ images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
480
+ area_avg_ratio=class_group["area"].mean(),
481
+ area_min_ratio=class_group["area"].min(),
482
+ area_max_ratio=class_group["area"].max(),
483
+ height_avg_ratio=class_group["height"].mean(),
484
+ height_min_ratio=class_group["height"].min(),
485
+ height_max_ratio=class_group["height"].max(),
486
+ width_avg_ratio=class_group["width"].mean(),
487
+ width_min_ratio=class_group["width"].min(),
488
+ width_max_ratio=class_group["width"].max(),
489
+ area_avg_px=class_group["area_px"].mean(),
490
+ area_min_px=int(class_group["area_px"].min()),
491
+ area_max_px=int(class_group["area_px"].max()),
492
+ height_avg_px=class_group["height_px"].mean(),
493
+ height_min_px=int(class_group["height_px"].min()),
494
+ height_max_px=int(class_group["height_px"].max()),
495
+ width_avg_px=class_group["width_px"].mean(),
496
+ width_min_px=int(class_group["width_px"].min()),
497
+ width_max_px=int(class_group["width_px"].max()),
361
498
  average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
362
499
  )
363
500
  )
364
501
 
365
- if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.classification.Classification):
502
+ if has_primitive(dataset_split, PrimitiveType=Classification):
366
503
  annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
367
- col_name = hafnia.dataset.primitives.classification.Classification.column_name()
368
- classification_tasks = [
369
- task.name
370
- for task in dataset.info.tasks
371
- if task.primitive == hafnia.dataset.primitives.classification.Classification
372
- ]
504
+ col_name = Classification.column_name()
505
+ classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
373
506
  has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
374
507
  if has_classification_data:
375
508
  classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
@@ -385,7 +518,7 @@ def dataset_info_from_dataset(
385
518
  ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
386
519
  if class_name is None:
387
520
  continue
388
- if task_name == hafnia.dataset.primitives.classification.Classification.default_task_name():
521
+ if task_name == Classification.default_task_name():
389
522
  display_name = class_name # Prefix class name with task name
390
523
  else:
391
524
  display_name = f"{task_name}.{class_name}"
@@ -394,6 +527,8 @@ def dataset_info_from_dataset(
394
527
  obj=DbAnnotatedObject(
395
528
  name=display_name,
396
529
  entity_type=EntityTypeChoices.EVENT.value,
530
+ annotation_type=annotation_type,
531
+ task_name=task_name,
397
532
  ),
398
533
  unique_obj_ids=len(
399
534
  class_group
@@ -403,22 +538,32 @@ def dataset_info_from_dataset(
403
538
  )
404
539
  )
405
540
 
406
- if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.segmentation.Segmentation):
541
+ if has_primitive(dataset_split, PrimitiveType=Segmentation):
407
542
  raise NotImplementedError("Not Implemented yet")
408
543
 
409
- if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bitmask.Bitmask):
410
- col_name = hafnia.dataset.primitives.bitmask.Bitmask.column_name()
544
+ if has_primitive(dataset_split, PrimitiveType=Bitmask):
545
+ col_name = Bitmask.column_name()
411
546
  drop_columns = [col for col in primitive_columns if col != col_name]
412
547
  drop_columns.append(FieldName.META)
413
- df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
414
- df_per_instance = df_per_instance.explode(col_name).drop(drop_columns).unnest(col_name)
415
548
 
416
- min_area = df_per_instance["area"].min() if "area" in df_per_instance.columns else None
417
- max_area = df_per_instance["area"].max() if "area" in df_per_instance.columns else None
418
- avg_area = df_per_instance["area"].mean() if "area" in df_per_instance.columns else None
549
+ df_per_instance = table_transformations.create_primitive_table(
550
+ dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
551
+ )
552
+ if df_per_instance is None:
553
+ raise ValueError(
554
+ f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
555
+ )
556
+ df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
557
+ df_per_instance = df_per_instance.with_columns(
558
+ (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
559
+ (pl.col("height_px") / pl.col("image.height")).alias("height"),
560
+ (pl.col("width_px") / pl.col("image.width")).alias("width"),
561
+ )
419
562
 
420
563
  annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
421
- for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
564
+ for (class_name, task_name), class_group in df_per_instance.group_by(
565
+ FieldName.CLASS_NAME, FieldName.TASK_NAME
566
+ ):
422
567
  if class_name is None:
423
568
  continue
424
569
  object_reports.append(
@@ -426,18 +571,36 @@ def dataset_info_from_dataset(
426
571
  obj=DbAnnotatedObject(
427
572
  name=class_name,
428
573
  entity_type=EntityTypeChoices.OBJECT.value,
574
+ annotation_type=annotation_type,
575
+ task_name=task_name,
429
576
  ),
430
577
  unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
431
578
  obj_instances=len(class_group),
432
579
  annotation_type=[annotation_type],
433
580
  average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
434
- avg_area=avg_area,
435
- min_area=min_area,
436
- max_area=max_area,
581
+ images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
582
+ area_avg_ratio=class_group["area"].mean(),
583
+ area_min_ratio=class_group["area"].min(),
584
+ area_max_ratio=class_group["area"].max(),
585
+ height_avg_ratio=class_group["height"].mean(),
586
+ height_min_ratio=class_group["height"].min(),
587
+ height_max_ratio=class_group["height"].max(),
588
+ width_avg_ratio=class_group["width"].mean(),
589
+ width_min_ratio=class_group["width"].min(),
590
+ width_max_ratio=class_group["width"].max(),
591
+ area_avg_px=class_group["area_px"].mean(),
592
+ area_min_px=int(class_group["area_px"].min()),
593
+ area_max_px=int(class_group["area_px"].max()),
594
+ height_avg_px=class_group["height_px"].mean(),
595
+ height_min_px=int(class_group["height_px"].min()),
596
+ height_max_px=int(class_group["height_px"].max()),
597
+ width_avg_px=class_group["width_px"].mean(),
598
+ width_min_px=int(class_group["width_px"].min()),
599
+ width_max_px=int(class_group["width_px"].max()),
437
600
  )
438
601
  )
439
602
 
440
- if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.polygon.Polygon):
603
+ if has_primitive(dataset_split, PrimitiveType=Polygon):
441
604
  raise NotImplementedError("Not Implemented yet")
442
605
 
443
606
  # Sort object reports by name to more easily compare between versions
@@ -456,13 +619,59 @@ def dataset_info_from_dataset(
456
619
  s3_bucket_name=bucket_sample,
457
620
  dataset_variants=dataset_variants,
458
621
  split_annotations_reports=dataset_reports,
459
- license_citation=dataset_meta_info.get("license_citation", None),
622
+ latest_update=dataset.info.updated_at,
623
+ dataset_format_version=dataset.info.format_version,
624
+ license_citation=dataset.info.reference_bibtex,
460
625
  data_captured_start=dataset_meta_info.get("data_captured_start", None),
461
626
  data_captured_end=dataset_meta_info.get("data_captured_end", None),
462
627
  data_received_start=dataset_meta_info.get("data_received_start", None),
463
628
  data_received_end=dataset_meta_info.get("data_received_end", None),
464
629
  annotation_project_id=dataset_meta_info.get("annotation_project_id", None),
465
630
  annotation_dataset_id=dataset_meta_info.get("annotation_dataset_id", None),
631
+ imgs=gallery_images,
466
632
  )
467
633
 
468
634
  return dataset_info
635
+
636
+
637
def create_gallery_images(
    dataset: HafniaDataset,
    path_gallery_images: Optional[Path],
    gallery_image_names: Optional[List[str]],
) -> Optional[List[DatasetImage]]:
    """
    Render the selected dataset samples as gallery images.

    Samples are selected by the file name (last '/'-separated part of the file
    path). For each selected sample the annotations are drawn onto the image,
    the result is saved in 'path_gallery_images' and wrapped in a 'DatasetImage'.

    Returns:
        None when no gallery image names are given, otherwise one
        'DatasetImage' per selected sample.

    Raises:
        ValueError: If names are given without a gallery folder, or if a
            requested name is not found in the dataset.
    """
    if not gallery_image_names:
        return None

    if path_gallery_images is None:
        raise ValueError("Path to gallery images must be provided.")
    path_gallery_images.mkdir(parents=True, exist_ok=True)

    name_column = "image_name"
    file_names = dataset.samples[ColumnName.FILE_PATH].str.split("/").list.last().alias(name_column)
    selected = dataset.samples.with_columns(file_names).filter(pl.col(name_column).is_in(gallery_image_names))

    not_found = set(gallery_image_names).difference(selected[name_column])
    if not_found:
        raise ValueError(f"Gallery images not found in dataset: {not_found}")

    images = []
    for row in selected.iter_rows(named=True):
        sample = Sample(**row)

        # Metadata is captured before classifications are removed for drawing
        metadata = DatasetImageMetadata.from_sample(sample=sample)
        sample.classifications = None  # To not draw classifications in gallery images
        rendered = sample.draw_annotations()

        path_rendered = path_gallery_images / row[name_column]
        Image.fromarray(rendered).save(path_rendered)

        image_fields = {"img": path_rendered, "metadata": metadata}
        attribution = sample.attribution
        if attribution is not None:
            attribution.changes = "Annotations have been visualized"
            image_fields.update(attribution.model_dump(exclude_none=True))

        gallery_image = DatasetImage(**image_fields)
        gallery_image.licenses = gallery_image.licenses or []
        images.append(gallery_image)
    return images
@@ -0,0 +1,106 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import List, Optional
4
+
5
+ import more_itertools
6
+ import polars as pl
7
+ from PIL import Image
8
+ from rich.progress import track
9
+
10
+ from hafnia.dataset.dataset_names import ColumnName, FieldName
11
+ from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
12
+ from hafnia.dataset.primitives import Classification
13
+ from hafnia.utils import is_image_file
14
+
15
+
16
def import_image_classification_directory_tree(
    path_folder: Path,
    split: str,
    n_samples: Optional[int] = None,
) -> HafniaDataset:
    """
    Create a Hafnia dataset from a '<path_folder>/<class_name>/...' directory tree.

    Every direct sub-folder of 'path_folder' is one class. Image files are
    collected recursively below each class folder and labelled with the name of
    that top-level class folder.

    Args:
        path_folder: Root folder containing one sub-folder per class.
        split: Split name assigned to every sample.
        n_samples: Optional upper limit on the number of samples. Images are
            interleaved across classes before truncation.

    Returns:
        A dataset with one classification per sample.
    """
    # Sort the folders by name so class indices AND interleaving order are deterministic
    class_folder_paths = sorted(
        (path for path in path_folder.iterdir() if path.is_dir()),
        key=lambda path: path.name,
    )
    class_names = sorted(folder.name for folder in class_folder_paths)
    class_idx_by_name = {name: idx for idx, name in enumerate(class_names)}

    # Gather all image paths per class
    path_images_per_class: List[List[Path]] = [
        sorted(path for path in path_class_folder.rglob("*.*") if is_image_file(path))
        for path_class_folder in class_folder_paths
    ]

    # Interleave to ensure classes are balanced in the output dataset for n_samples < total
    path_images = list(more_itertools.interleave_longest(*path_images_per_class))

    if n_samples is not None:
        path_images = path_images[:n_samples]

    samples = []
    for path_image_org in track(path_images, description="Convert 'image classification' dataset to Hafnia Dataset"):
        # Images are found recursively, so the class is the top-level folder below
        # 'path_folder' - not necessarily the image's immediate parent folder.
        class_name = path_image_org.relative_to(path_folder).parts[0]

        # Only the size is needed; close the file handle immediately
        with Image.open(path_image_org) as read_image:
            width, height = read_image.size

        classifications = [Classification(class_name=class_name, class_idx=class_idx_by_name[class_name])]
        sample = Sample(
            file_path=str(path_image_org.absolute()),
            width=width,
            height=height,
            split=split,
            classifications=classifications,
        )
        samples.append(sample)

    dataset_info = DatasetInfo(
        dataset_name="ImageClassificationFromDirectoryTree",
        tasks=[TaskInfo(primitive=Classification, class_names=class_names)],
    )

    hafnia_dataset = HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
    return hafnia_dataset
63
+
64
+
65
def export_image_classification_directory_tree(
    dataset: HafniaDataset,
    path_output: Path,
    task_name: Optional[str] = None,
    clean_folder: bool = False,
) -> Path:
    """
    Copy the dataset images into a '<path_output>/<class_name>/<image>' directory tree.

    Only classifications belonging to the selected task are considered, and
    every sample must have exactly one of them.

    Args:
        dataset: Dataset to export.
        path_output: Root folder of the exported tree (created if missing).
        task_name: Name of the classification task, passed to the task lookup.
        clean_folder: If True, remove 'path_output' before exporting.

    Returns:
        'path_output'.

    Raises:
        ValueError: If a sample has zero or multiple classifications for the task.
    """
    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
    column = task.primitive.column_name()

    # Keep only the classifications that belong to the selected task
    is_selected_task = pl.element().struct.field(FieldName.TASK_NAME) == task.name
    samples = dataset.samples.with_columns(pl.col(column).list.filter(is_selected_task).alias(column))

    n_classifications = samples[column].list.len()
    if (n_classifications == 0).sum() > 0:
        raise ValueError(f"Some samples do not have a classification for task '{task.name}'.")
    if (n_classifications > 1).sum() > 0:
        raise ValueError(f"Some samples have multiple classifications for task '{task.name}'.")

    if clean_folder:
        shutil.rmtree(path_output, ignore_errors=True)
    path_output.mkdir(parents=True, exist_ok=True)

    rows = track(
        samples.iter_rows(named=True),
        total=len(samples),
        description="Export Hafnia Dataset to directory tree",
    )
    for row in rows:
        labels = row[column]
        if len(labels) != 1:
            raise ValueError("Each sample should have exactly one classification.")

        # Avoid issues with subfolders when the class name contains '/'
        folder_name = labels[0][FieldName.CLASS_NAME].replace("/", "_")
        path_class_folder = path_output / folder_name
        path_class_folder.mkdir(parents=True, exist_ok=True)

        path_source = Path(row[ColumnName.FILE_PATH])
        shutil.copy2(path_source, path_class_folder / path_source.name)

    return path_output