hafnia-0.4.2-py3-none-any.whl → hafnia-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py} +148 -238
- hafnia/dataset/dataset_helpers.py +1 -15
- hafnia/dataset/dataset_names.py +43 -3
- hafnia/dataset/format_conversions/format_coco.py +490 -0
- hafnia/dataset/format_conversions/format_helpers.py +33 -0
- hafnia/dataset/format_conversions/format_image_classification_folder.py +95 -14
- hafnia/dataset/format_conversions/format_yolo.py +115 -25
- hafnia/dataset/format_conversions/torchvision_datasets.py +16 -11
- hafnia/dataset/hafnia_dataset.py +119 -490
- hafnia/dataset/hafnia_dataset_types.py +479 -0
- hafnia/dataset/license_types.py +4 -4
- hafnia/dataset/operations/dataset_s3_storage.py +211 -0
- hafnia/dataset/operations/dataset_stats.py +3 -3
- hafnia/dataset/operations/dataset_transformations.py +14 -17
- hafnia/dataset/operations/table_transformations.py +22 -14
- hafnia/dataset/primitives/bbox.py +6 -2
- hafnia/dataset/primitives/bitmask.py +21 -46
- hafnia/dataset/primitives/classification.py +1 -1
- hafnia/dataset/primitives/polygon.py +43 -2
- hafnia/dataset/primitives/primitive.py +1 -1
- hafnia/dataset/primitives/segmentation.py +1 -1
- hafnia/experiment/hafnia_logger.py +13 -4
- hafnia/http.py +2 -1
- hafnia/platform/datasets.py +195 -105
- hafnia/platform/s5cmd_utils.py +147 -0
- hafnia/torch_helpers.py +48 -4
- hafnia/utils.py +38 -0
- hafnia/visualizations/image_visualizations.py +3 -1
- {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/METADATA +4 -4
- hafnia-0.5.0.dist-info/RECORD +62 -0
- {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/WHEEL +1 -1
- hafnia_cli/dataset_cmds.py +18 -0
- hafnia_cli/profile_cmds.py +0 -1
- hafnia-0.4.2.dist-info/RECORD +0 -57
- {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.4.2.dist-info → hafnia-0.5.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/{dataset_upload_helper.py → dataset_details_uploader.py}
RENAMED
@@ -4,22 +4,21 @@ import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Type, Union
 
 import boto3
 import polars as pl
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, field_validator
 
-from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
     DatasetVariant,
-    DeploymentStage,
     PrimitiveField,
     SampleField,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import
+from hafnia.dataset.hafnia_dataset import HafniaDataset
+from hafnia.dataset.hafnia_dataset_types import Attribution, Sample, TaskInfo
 from hafnia.dataset.operations import table_transformations
 from hafnia.dataset.primitives import (
     Bbox,
@@ -29,26 +28,21 @@ from hafnia.dataset.primitives import (
     Segmentation,
 )
 from hafnia.dataset.primitives.primitive import Primitive
-from hafnia.
-from hafnia.
-from hafnia.platform.datasets import get_dataset_id
+from hafnia.platform.datasets import upload_dataset_details
+from hafnia.utils import get_path_dataset_gallery_images
 from hafnia_cli.config import Config
 
 
-
-# TODO: When moving to versioning we do NOT need 'staging' and 'production' specific buckets
-# and the new name convention should be: f"hafnia-dataset-{dataset_name}"
-    return f"mdi-{deployment_stage.value}-{dataset_name}"
-
-
-class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+class DatasetDetails(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
+    title: Optional[str] = None
+    overview: Optional[str] = None
     data_captured_start: Optional[datetime] = None
     data_captured_end: Optional[datetime] = None
     data_received_start: Optional[datetime] = None
     data_received_end: Optional[datetime] = None
-
+    dataset_updated_at: Optional[datetime] = None
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
@@ -150,14 +144,6 @@ class DbAnnotationType(BaseModel, validate_assignment=True):  # type: ignore[cal
     name: str
 
 
-class AnnotationType(Enum):
-    ImageClassification = "Image Classification"
-    ObjectDetection = "Object Detection"
-    SegmentationMask = "Segmentation Mask"
-    ImageCaptioning = "Image Captioning"
-    InstanceSegmentation = "Instance Segmentation"
-
-
 class DbResolution(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     height: int
     width: int
@@ -289,26 +275,32 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
 
 
-def
-
-
-
-
-
-
-
-
-
-
-
-
+def upload_dataset_details_to_platform(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
+    distribution_task_names: Optional[List[str]] = None,
+    update_platform: bool = True,
+    cfg: Optional[Config] = None,
+) -> dict:
+    cfg = cfg or Config()
+    dataset_details = dataset_details_from_hafnia_dataset(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+        distribution_task_names=distribution_task_names,
+    )
 
-
-
+    if update_platform:
+        dataset_details_exclude_none = dataset_details.model_dump(exclude_none=True, mode="json")
+        upload_dataset_details(
+            cfg=cfg,
+            data=dataset_details_exclude_none,
+            dataset_name=dataset_details.name,
+        )
 
-
-
-    return response  # type: ignore[return-value]
+    dataset_details_dict = dataset_details.model_dump(exclude_none=False, mode="json")
+    return dataset_details_dict
 
 
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
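A minimal usage sketch of the new `upload_dataset_details_to_platform` entry point, assuming `dataset` is an already-loaded `HafniaDataset` (the diff does not show a loader, so none is invented here):

```python
from hafnia.dataset.dataset_details_uploader import upload_dataset_details_to_platform

# `dataset` is assumed to be a loaded HafniaDataset instance.
# With update_platform=False nothing is uploaded; the function only builds
# and returns the details payload as a plain dict (exclude_none=False dump).
details = upload_dataset_details_to_platform(dataset, update_platform=False)
print(details["name"], details["version"])
```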
@@ -322,18 +314,6 @@ def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -
     return resolutions
 
 
-def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Type[Primitive]) -> bool:
-    col_name = PrimitiveType.column_name()
-    table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
-    if col_name not in table.columns:
-        return False
-
-    if table[col_name].dtype == pl.Null:
-        return False
-
-    return True
-
-
 def calculate_distribution_values(
     dataset_split: pl.DataFrame, distribution_tasks: Optional[List[TaskInfo]]
 ) -> List[DbDistributionValue]:
@@ -378,46 +358,34 @@ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: bot
     return last_modified, size
 
 
-def
+def dataset_details_from_hafnia_dataset(
     dataset: HafniaDataset,
-    deployment_stage: DeploymentStage,
-    path_sample: Optional[Path],
-    path_hidden: Optional[Path],
     path_gallery_images: Optional[Path] = None,
     gallery_image_names: Optional[List[str]] = None,
-    distribution_task_names: Optional[List[
-) ->
+    distribution_task_names: Optional[List[str]] = None,
+) -> DatasetDetails:
     dataset_variants = []
     dataset_reports = []
     dataset_meta_info = dataset.info.meta or {}
 
-    path_and_variant
-    if path_sample is not None:
-        path_and_variant.append((path_sample, DatasetVariant.SAMPLE))
-
-    if path_hidden is not None:
-        path_and_variant.append((path_hidden, DatasetVariant.HIDDEN))
-
-    if len(path_and_variant) == 0:
-        raise ValueError("At least one path must be provided for sample or hidden dataset.")
-
+    path_and_variant = [DatasetVariant.SAMPLE, DatasetVariant.HIDDEN]
     gallery_images = create_gallery_images(
         dataset=dataset,
         path_gallery_images=path_gallery_images,
         gallery_image_names=gallery_image_names,
     )
 
-    for
+    for variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
         else:
             dataset_variant = dataset
 
-
+        files_paths = dataset_variant.samples[SampleField.FILE_PATH].to_list()
+        size_bytes = sum([Path(file_path).stat().st_size for file_path in files_paths])
         dataset_variants.append(
             DbDatasetVariant(
                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
-                # upload_date: Optional[datetime] = None
                 size_bytes=size_bytes,
                 data_type=DataTypeChoices.images,
                 number_of_data_items=len(dataset_variant),
@@ -425,7 +393,6 @@ def dataset_info_from_dataset(
                 duration=dataset_meta_info.get("duration", None),
                 duration_average=dataset_meta_info.get("duration_average", None),
                 frame_rate=dataset_meta_info.get("frame_rate", None),
-                # bit_rate: Optional[float] = None
                 n_cameras=dataset_meta_info.get("n_cameras", None),
             )
         )
@@ -448,165 +415,8 @@ def dataset_info_from_dataset(
         )
 
         object_reports: List[DbAnnotatedObjectReport] = []
-
-
-        df_per_instance = table_transformations.create_primitive_table(
-            dataset_split, PrimitiveType=Bbox, keep_sample_data=True
-        )
-        if df_per_instance is None:
-            raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
-        # Calculate area of bounding boxes
-        df_per_instance = df_per_instance.with_columns(
-            (pl.col("height") * pl.col("width")).alias("area"),
-        ).with_columns(
-            (pl.col("height") * pl.col("image.height")).alias("height_px"),
-            (pl.col("width") * pl.col("image.width")).alias("width_px"),
-            (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
-        )
-
-        annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-        for (class_name, task_name), class_group in df_per_instance.group_by(
-            PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-        ):
-            if class_name is None:
-                continue
-            object_reports.append(
-                DbAnnotatedObjectReport(
-                    obj=DbAnnotatedObject(
-                        name=class_name,
-                        entity_type=EntityTypeChoices.OBJECT.value,
-                        annotation_type=annotation_type,
-                        task_name=task_name,
-                    ),
-                    unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                    obj_instances=len(class_group),
-                    annotation_type=[annotation_type],
-                    images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                    area_avg_ratio=class_group["area"].mean(),
-                    area_min_ratio=class_group["area"].min(),
-                    area_max_ratio=class_group["area"].max(),
-                    height_avg_ratio=class_group["height"].mean(),
-                    height_min_ratio=class_group["height"].min(),
-                    height_max_ratio=class_group["height"].max(),
-                    width_avg_ratio=class_group["width"].mean(),
-                    width_min_ratio=class_group["width"].min(),
-                    width_max_ratio=class_group["width"].max(),
-                    area_avg_px=class_group["area_px"].mean(),
-                    area_min_px=int(class_group["area_px"].min()),
-                    area_max_px=int(class_group["area_px"].max()),
-                    height_avg_px=class_group["height_px"].mean(),
-                    height_min_px=int(class_group["height_px"].min()),
-                    height_max_px=int(class_group["height_px"].max()),
-                    width_avg_px=class_group["width_px"].mean(),
-                    width_min_px=int(class_group["width_px"].min()),
-                    width_max_px=int(class_group["width_px"].max()),
-                    average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                )
-            )
-
-        if has_primitive(dataset_split, PrimitiveType=Classification):
-            annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-            col_name = Classification.column_name()
-            classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
-            has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
-            if has_classification_data:
-                classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
-
-                # Include only classification tasks that are defined in the dataset info
-                classification_df = classification_df.filter(
-                    pl.col(PrimitiveField.TASK_NAME).is_in(classification_tasks)
-                )
-
-                for (
-                    task_name,
-                    class_name,
-                ), class_group in classification_df.group_by(PrimitiveField.TASK_NAME, PrimitiveField.CLASS_NAME):
-                    if class_name is None:
-                        continue
-                    if task_name == Classification.default_task_name():
-                        display_name = class_name  # Prefix class name with task name
-                    else:
-                        display_name = f"{task_name}.{class_name}"
-                    object_reports.append(
-                        DbAnnotatedObjectReport(
-                            obj=DbAnnotatedObject(
-                                name=display_name,
-                                entity_type=EntityTypeChoices.EVENT.value,
-                                annotation_type=annotation_type,
-                                task_name=task_name,
-                            ),
-                            unique_obj_ids=len(
-                                class_group
-                            ),  # Unique object IDs are not applicable for classification
-                            obj_instances=len(class_group),
-                            annotation_type=[annotation_type],
-                        )
-                    )
-
-        if has_primitive(dataset_split, PrimitiveType=Segmentation):
-            raise NotImplementedError("Not Implemented yet")
-
-        if has_primitive(dataset_split, PrimitiveType=Bitmask):
-            col_name = Bitmask.column_name()
-            drop_columns = [col for col in primitive_columns if col != col_name]
-            drop_columns.append(PrimitiveField.META)
-
-            df_per_instance = table_transformations.create_primitive_table(
-                dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
-            )
-            if df_per_instance is None:
-                raise ValueError(
-                    f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
-                )
-            df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
-            df_per_instance = df_per_instance.with_columns(
-                (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
-                (pl.col("height_px") / pl.col("image.height")).alias("height"),
-                (pl.col("width_px") / pl.col("image.width")).alias("width"),
-            )
-
-            annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-            for (class_name, task_name), class_group in df_per_instance.group_by(
-                PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
-            ):
-                if class_name is None:
-                    continue
-                object_reports.append(
-                    DbAnnotatedObjectReport(
-                        obj=DbAnnotatedObject(
-                            name=class_name,
-                            entity_type=EntityTypeChoices.OBJECT.value,
-                            annotation_type=annotation_type,
-                            task_name=task_name,
-                        ),
-                        unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
-                        obj_instances=len(class_group),
-                        annotation_type=[annotation_type],
-                        average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
-                        area_avg_ratio=class_group["area"].mean(),
-                        area_min_ratio=class_group["area"].min(),
-                        area_max_ratio=class_group["area"].max(),
-                        height_avg_ratio=class_group["height"].mean(),
-                        height_min_ratio=class_group["height"].min(),
-                        height_max_ratio=class_group["height"].max(),
-                        width_avg_ratio=class_group["width"].mean(),
-                        width_min_ratio=class_group["width"].min(),
-                        width_max_ratio=class_group["width"].max(),
-                        area_avg_px=class_group["area_px"].mean(),
-                        area_min_px=int(class_group["area_px"].min()),
-                        area_max_px=int(class_group["area_px"].max()),
-                        height_avg_px=class_group["height_px"].mean(),
-                        height_min_px=int(class_group["height_px"].min()),
-                        height_max_px=int(class_group["height_px"].max()),
-                        width_avg_px=class_group["width_px"].mean(),
-                        width_min_px=int(class_group["width_px"].min()),
-                        width_max_px=int(class_group["width_px"].max()),
-                    )
-                )
-
-        if has_primitive(dataset_split, PrimitiveType=Polygon):
-            raise NotImplementedError("Not Implemented yet")
+        for PrimitiveType in [Classification, Bbox, Bitmask, Polygon, Segmentation]:
+            object_reports.extend(create_reports_from_primitive(dataset_split, PrimitiveType=PrimitiveType))  # type: ignore[type-abstract]
 
         # Sort object reports by name to more easily compare between versions
         object_reports = sorted(object_reports, key=lambda x: x.obj.name)  # Sort object reports by name
@@ -617,14 +427,14 @@ def dataset_info_from_dataset(
 
     dataset_reports.append(report)
     dataset_name = dataset.info.dataset_name
-
-    dataset_info = DbDataset(
+    dataset_info = DatasetDetails(
         name=dataset_name,
+        title=dataset.info.dataset_title,
+        overview=dataset.info.description,
         version=dataset.info.version,
-        s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-
+        dataset_updated_at=dataset.info.updated_at,
         dataset_format_version=dataset.info.format_version,
         license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
@@ -639,6 +449,101 @@ def dataset_info_from_dataset(
     return dataset_info
 
 
+def create_reports_from_primitive(
+    dataset_split: pl.DataFrame, PrimitiveType: Type[Primitive]
+) -> List[DbAnnotatedObjectReport]:
+    if not table_transformations.has_primitive(dataset_split, PrimitiveType=PrimitiveType):
+        return []
+
+    if PrimitiveType == Segmentation:
+        raise NotImplementedError("Not Implemented yet")
+
+    df_per_instance = table_transformations.create_primitive_table(
+        dataset_split, PrimitiveType=PrimitiveType, keep_sample_data=True
+    )
+    if df_per_instance is None:
+        raise ValueError(f"Expected {PrimitiveType.__name__} primitive column to be present in the dataset split.")
+
+    entity_type = EntityTypeChoices.OBJECT.value
+    if PrimitiveType == Classification:
+        entity_type = EntityTypeChoices.EVENT.value
+
+    if PrimitiveType == Bbox:
+        df_per_instance = df_per_instance.with_columns(area=pl.col("height") * pl.col("width"))
+
+    if PrimitiveType == Bitmask:
+        # width and height are in pixel format for Bitmask convert to ratio
+        df_per_instance = df_per_instance.with_columns(
+            width=pl.col("width") / pl.col("image.width"),
+            height=pl.col("height") / pl.col("image.height"),
+        )
+
+    has_height_field = "height" in df_per_instance.columns and df_per_instance["height"].dtype != pl.Null
+    if has_height_field:
+        df_per_instance = df_per_instance.with_columns(
+            height_px=pl.col("height") * pl.col("image.height"),
+        )
+
+    has_width_field = "width" in df_per_instance.columns and df_per_instance["width"].dtype != pl.Null
+    if has_width_field:
+        df_per_instance = df_per_instance.with_columns(
+            width_px=pl.col("width") * pl.col("image.width"),
+        )
+
+    has_area_field = "area" in df_per_instance.columns and df_per_instance["area"].dtype != pl.Null
+    if has_area_field:
+        df_per_instance = df_per_instance.with_columns(
+            area_px=pl.col("image.height") * pl.col("image.width") * pl.col("area")
+        )
+    object_reports: List[DbAnnotatedObjectReport] = []
+    annotation_type = DbAnnotationType(name=PrimitiveType.__name__)
+    for (class_name, task_name), class_group in df_per_instance.group_by(
+        PrimitiveField.CLASS_NAME, PrimitiveField.TASK_NAME
+    ):
+        if class_name is None:
+            continue
+
+        object_report = DbAnnotatedObjectReport(
+            obj=DbAnnotatedObject(
+                name=class_name,
+                entity_type=entity_type,
+                annotation_type=annotation_type,
+                task_name=task_name,
+            ),
+            unique_obj_ids=class_group[PrimitiveField.OBJECT_ID].n_unique(),
+            obj_instances=len(class_group),
+            annotation_type=[annotation_type],
+            average_count_per_image=len(class_group) / class_group[SampleField.SAMPLE_INDEX].n_unique(),
+            images_with_obj=class_group[SampleField.SAMPLE_INDEX].n_unique(),
+        )
+        if has_height_field:
+            object_report.height_avg_ratio = class_group["height"].mean()
+            object_report.height_min_ratio = class_group["height"].min()
+            object_report.height_max_ratio = class_group["height"].max()
+            object_report.height_avg_px = class_group["height_px"].mean()
+            object_report.height_min_px = int(class_group["height_px"].min())
+            object_report.height_max_px = int(class_group["height_px"].max())
+
+        if has_width_field:
+            object_report.width_avg_ratio = class_group["width"].mean()
+            object_report.width_min_ratio = class_group["width"].min()
+            object_report.width_max_ratio = class_group["width"].max()
+            object_report.width_avg_px = class_group["width_px"].mean()
+            object_report.width_min_px = int(class_group["width_px"].min())
+            object_report.width_max_px = int(class_group["width_px"].max())
+
+        if has_area_field:
+            object_report.area_avg_ratio = class_group["area"].mean()
+            object_report.area_min_ratio = class_group["area"].min()
+            object_report.area_max_ratio = class_group["area"].max()
+            object_report.area_avg_px = class_group["area_px"].mean()
+            object_report.area_min_px = int(class_group["area_px"].min())
+            object_report.area_max_px = int(class_group["area_px"].max())
+
+        object_reports.append(object_report)
+    return object_reports
+
+
 def create_gallery_images(
     dataset: HafniaDataset,
     path_gallery_images: Optional[Path],
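Each call of the consolidated helper above replaces one of the removed per-primitive blocks. A sketch of invoking it directly for a single primitive type (using the full sample table as the split is an assumption; any polars DataFrame in the dataset's sample schema should work):

```python
from hafnia.dataset.dataset_details_uploader import create_reports_from_primitive
from hafnia.dataset.primitives import Bbox

# `dataset` is assumed to be a loaded HafniaDataset; its sample table acts as one split.
reports = create_reports_from_primitive(dataset.samples, PrimitiveType=Bbox)
for report in reports:
    # Ratio/pixel stats are only populated when the primitive carries height/width/area fields.
    print(report.obj.name, report.obj_instances, report.area_avg_ratio)
```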
@@ -647,7 +552,7 @@ def create_gallery_images(
     gallery_images = None
     if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
         if path_gallery_images is None:
-
+            path_gallery_images = get_path_dataset_gallery_images(dataset.info.dataset_name)
         path_gallery_images.mkdir(parents=True, exist_ok=True)
         COL_IMAGE_NAME = "image_name"
         samples = dataset.samples.with_columns(
@@ -657,7 +562,12 @@ def create_gallery_images(
 
         missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
         if len(missing_gallery_samples):
-
+            potential_samples = samples[COL_IMAGE_NAME].sort().to_list()
+            formatted_samples = ", ".join([f'"{s}"' for s in potential_samples[:9]])
+            raise ValueError(
+                f"Gallery images not found in dataset: {missing_gallery_samples}. "
+                f"Consider adding this to dataset definition: \ngallery_image_names=[{formatted_samples}]"
+            )
         gallery_images = []
         for gallery_sample in gallery_samples.iter_rows(named=True):
            sample = Sample(**gallery_sample)
hafnia/dataset/dataset_helpers.py
CHANGED
@@ -57,20 +57,6 @@ def save_pil_image_with_hash_name(image: Image.Image, path_folder: Path, allow_s
 def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Path) -> Path:
     """
     Copies a file to a dataset root directory with a hash-based name and sub-directory structure.
-
-    E.g. for an "image.png" with hash "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", the image will be copied to
-    'path_dataset_root / "data" / "dfe" / "dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png"'
-    Notice that the hash is used for both the filename and the subfolder name.
-
-    Placing image/video files into multiple sub-folders (instead of one large folder) is seemingly
-    unnecessary, but it is actually a requirement when the dataset is later downloaded from S3.
-
-    The reason is that AWS has a rate limit of 3500 ops/sec per prefix (sub-folder) in S3 - meaning we can "only"
-    download 3500 files per second from a single folder (prefix) in S3.
-
-    For even a single user, we found that this limit was being reached when files are stored in single folder (prefix)
-    in S3. To support multiple users and concurrent experiments, we are required to separate files into
-    multiple sub-folders (prefixes) in S3 to not hit the rate limit.
     """
 
     if not path_source.exists():
@@ -86,7 +72,7 @@ def copy_and_rename_file_to_hash_value(path_source: Path, path_dataset_root: Pat
 
 
 def relative_path_from_hash(hash: str, suffix: str) -> Path:
-    path_file = Path("data") /
+    path_file = Path("data") / f"{hash}{suffix}"
     return path_file
 
 
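The rewritten helper now maps a content hash straight to a path under `data/`; a small sketch of the resulting layout (the hash value is illustrative):

```python
from pathlib import Path

def relative_path_from_hash(hash: str, suffix: str) -> Path:
    # Same one-liner as in the hunk above: filename is the content hash plus suffix.
    return Path("data") / f"{hash}{suffix}"

print(relative_path_from_hash("dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4", ".png"))
# data/dfe8f3b1c2a4f5b6c7d8e9f0a1b2c3d4.png
```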
hafnia/dataset/dataset_names.py
CHANGED
@@ -2,6 +2,7 @@ from enum import Enum
 from typing import Dict, List, Optional
 
 import boto3
+from botocore.exceptions import UnauthorizedSSOTokenError
 from pydantic import BaseModel, field_validator
 
 FILENAME_RECIPE_JSON = "recipe.json"
@@ -21,6 +22,7 @@ class DeploymentStage(Enum):
     PRODUCTION = "production"
 
 
+ARN_PREFIX = "arn:aws:s3:::"
 TAG_IS_SAMPLE = "sample"
 
 OPS_REMOVE_CLASS = "__REMOVE__"
@@ -93,6 +95,32 @@ class SplitName:
     def all_split_names() -> List[str]:
         return [*SplitName.valid_splits(), SplitName.UNDEFINED]
 
+    @staticmethod
+    def map_split_name(potential_split_name: str, strict: bool = True) -> str:
+        normalized = potential_split_name.strip().lower()
+
+        if normalized in SPLIT_NAME_MAPPINGS:
+            return SPLIT_NAME_MAPPINGS[normalized]
+
+        if strict:
+            raise ValueError(f"Unrecognized split name: {potential_split_name}")
+        else:
+            return SplitName.UNDEFINED
+
+
+SPLIT_NAME_MAPPINGS = {
+    # Train variations
+    "train": SplitName.TRAIN,
+    "training": SplitName.TRAIN,
+    # Validation variations
+    "validation": SplitName.VAL,
+    "val": SplitName.VAL,
+    "valid": SplitName.VAL,
+    # Test variations
+    "test": SplitName.TEST,
+    "testing": SplitName.TEST,
+}
+
 
 class DatasetVariant(Enum):
     DUMP = "dump"
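A short sketch of how the new `SplitName.map_split_name` normalizer behaves, following the strip/lowercase logic and mapping table above:

```python
from hafnia.dataset.dataset_names import SplitName

SplitName.map_split_name("Training")           # -> SplitName.TRAIN
SplitName.map_split_name(" VALID ")            # -> SplitName.VAL
SplitName.map_split_name("dev", strict=False)  # -> SplitName.UNDEFINED
SplitName.map_split_name("dev")                # raises ValueError: Unrecognized split name: dev
```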
@@ -125,7 +153,14 @@ class AwsCredentials(BaseModel):
         """
         Creates AwsCredentials from a Boto3 session.
         """
-
+        try:
+            frozen_credentials = session.get_credentials().get_frozen_credentials()
+        except UnauthorizedSSOTokenError as e:
+            raise RuntimeError(
+                f"Failed to get AWS credentials from the session for profile '{session.profile_name}'.\n"
+                f"Ensure the profile exists in your AWS config in '~/.aws/config' and that you are logged in via AWS SSO.\n"
+                f"\tUse 'aws sso login --profile {session.profile_name}' to log in."
+            ) from e
         return AwsCredentials(
             access_key=frozen_credentials.access_key,
             secret_key=frozen_credentials.secret_key,
@@ -133,8 +168,13 @@ class AwsCredentials(BaseModel):
             region=session.region_name,
         )
 
-
-
+    def to_resource_credentials(self, bucket_name: str) -> "ResourceCredentials":
+        """
+        Converts AwsCredentials to ResourceCredentials by adding the S3 ARN.
+        """
+        payload = self.model_dump()
+        payload["s3_arn"] = f"{ARN_PREFIX}{bucket_name}"
+        return ResourceCredentials(**payload)
 
 
 class ResourceCredentials(AwsCredentials):
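Taken together, the two additions give a path from a boto3 session to bucket-scoped credentials. A sketch, assuming the factory shown above is exposed as `AwsCredentials.from_session` (its name is cropped in this view) and using placeholder profile and bucket names:

```python
import boto3

from hafnia.dataset.dataset_names import AwsCredentials

session = boto3.Session(profile_name="my-profile")  # placeholder profile
creds = AwsCredentials.from_session(session)  # raises RuntimeError with an SSO login hint if the token is stale
resource_creds = creds.to_resource_credentials(bucket_name="my-bucket")
print(resource_creds.s3_arn)  # arn:aws:s3:::my-bucket
```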