hafnia 0.2.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__main__.py +16 -3
- cli/config.py +45 -4
- cli/consts.py +1 -1
- cli/dataset_cmds.py +6 -14
- cli/dataset_recipe_cmds.py +78 -0
- cli/experiment_cmds.py +226 -43
- cli/keychain.py +88 -0
- cli/profile_cmds.py +10 -6
- cli/runc_cmds.py +5 -5
- cli/trainer_package_cmds.py +65 -0
- hafnia/__init__.py +2 -0
- hafnia/data/factory.py +1 -2
- hafnia/dataset/dataset_helpers.py +9 -14
- hafnia/dataset/dataset_names.py +10 -5
- hafnia/dataset/dataset_recipe/dataset_recipe.py +165 -67
- hafnia/dataset/dataset_recipe/recipe_transforms.py +48 -4
- hafnia/dataset/dataset_recipe/recipe_types.py +1 -1
- hafnia/dataset/dataset_upload_helper.py +265 -56
- hafnia/dataset/format_conversions/image_classification_from_directory.py +106 -0
- hafnia/dataset/format_conversions/torchvision_datasets.py +281 -0
- hafnia/dataset/hafnia_dataset.py +577 -213
- hafnia/dataset/license_types.py +63 -0
- hafnia/dataset/operations/dataset_stats.py +259 -3
- hafnia/dataset/operations/dataset_transformations.py +332 -7
- hafnia/dataset/operations/table_transformations.py +43 -5
- hafnia/dataset/primitives/__init__.py +8 -0
- hafnia/dataset/primitives/bbox.py +25 -12
- hafnia/dataset/primitives/bitmask.py +26 -14
- hafnia/dataset/primitives/classification.py +16 -8
- hafnia/dataset/primitives/point.py +7 -3
- hafnia/dataset/primitives/polygon.py +16 -9
- hafnia/dataset/primitives/segmentation.py +10 -7
- hafnia/experiment/hafnia_logger.py +111 -8
- hafnia/http.py +16 -2
- hafnia/platform/__init__.py +9 -3
- hafnia/platform/builder.py +12 -10
- hafnia/platform/dataset_recipe.py +104 -0
- hafnia/platform/datasets.py +47 -9
- hafnia/platform/download.py +25 -19
- hafnia/platform/experiment.py +51 -56
- hafnia/platform/trainer_package.py +57 -0
- hafnia/utils.py +81 -13
- hafnia/visualizations/image_visualizations.py +4 -4
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/METADATA +40 -34
- hafnia-0.4.0.dist-info/RECORD +56 -0
- cli/recipe_cmds.py +0 -45
- hafnia-0.2.4.dist-info/RECORD +0 -49
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/WHEEL +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/entry_points.txt +0 -0
- {hafnia-0.2.4.dist-info → hafnia-0.4.0.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/dataset_upload_helper.py:

```diff
@@ -1,19 +1,16 @@
 from __future__ import annotations
 
+import base64
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import boto3
 import polars as pl
-from
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, field_validator
 
-import hafnia.dataset.primitives.bbox
-import hafnia.dataset.primitives.bitmask
-import hafnia.dataset.primitives.classification
-import hafnia.dataset.primitives.polygon
-import hafnia.dataset.primitives.segmentation
 from cli.config import Config
 from hafnia.dataset import primitives
 from hafnia.dataset.dataset_names import (
@@ -23,11 +20,19 @@ from hafnia.dataset.dataset_names import (
     FieldName,
     SplitName,
 )
-from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
+from hafnia.dataset.hafnia_dataset import Attribution, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.operations import table_transformations
+from hafnia.dataset.primitives import (
+    Bbox,
+    Bitmask,
+    Classification,
+    Polygon,
+    Segmentation,
+)
 from hafnia.dataset.primitives.primitive import Primitive
 from hafnia.http import post
 from hafnia.log import user_logger
-from hafnia.platform import get_dataset_id
+from hafnia.platform.datasets import get_dataset_id
 
 
 def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
@@ -47,13 +52,14 @@ class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     license_citation: Optional[str] = None
     version: Optional[str] = None
     s3_bucket_name: Optional[str] = None
+    dataset_format_version: Optional[str] = None
     annotation_date: Optional[datetime] = None
     annotation_project_id: Optional[str] = None
     annotation_dataset_id: Optional[str] = None
     annotation_ontology: Optional[str] = None
     dataset_variants: Optional[List[DbDatasetVariant]] = None
     split_annotations_reports: Optional[List[DbSplitAnnotationsReport]] = None
-
+    imgs: Optional[List[DatasetImage]] = None
 
 
 class DbDatasetVariant(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -75,6 +81,8 @@ class DbAnnotatedObject(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
     name: str
     entity_type: EntityTypeChoices
+    annotation_type: DbAnnotationType
+    task_name: Optional[str] = None  # Not sure if adding task_name makes sense.
 
 
 class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
@@ -82,10 +90,34 @@ class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
     obj: DbAnnotatedObject
     unique_obj_ids: Optional[int] = None
     obj_instances: Optional[int] = None
+    images_with_obj: Optional[int] = None
+
     average_count_per_image: Optional[float] = None
-
-
-
+
+    area_avg_ratio: Optional[float] = None
+    area_min_ratio: Optional[float] = None
+    area_max_ratio: Optional[float] = None
+
+    height_avg_ratio: Optional[float] = None
+    height_min_ratio: Optional[float] = None
+    height_max_ratio: Optional[float] = None
+
+    width_avg_ratio: Optional[float] = None
+    width_min_ratio: Optional[float] = None
+    width_max_ratio: Optional[float] = None
+
+    area_avg_px: Optional[float] = None
+    area_min_px: Optional[int] = None
+    area_max_px: Optional[int] = None
+
+    height_avg_px: Optional[float] = None
+    height_min_px: Optional[int] = None
+    height_max_px: Optional[int] = None
+
+    width_avg_px: Optional[float] = None
+    width_min_px: Optional[int] = None
+    width_max_px: Optional[int] = None
+
     annotation_type: Optional[List[DbAnnotationType]] = None
 
 
```
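The new `*_ratio` and `*_px` fields on `DbAnnotatedObjectReport` store per-class geometry statistics in both normalized units (relative to the image) and pixel units. The pixel values follow from the normalized geometry and the image dimensions, as computed in the `with_columns` pipelines further down. A minimal sketch of that conversion with made-up numbers:

```python
# Sketch: how normalized bbox geometry maps to the new *_px statistics.
# Toy values; the real code computes these per class group with polars
# expressions over "height", "width", "image.height" and "image.width".
height_ratio, width_ratio = 0.25, 0.10  # bbox size relative to the image
image_height, image_width = 1080, 1920  # image size in pixels

height_px = height_ratio * image_height  # 270.0
width_px = width_ratio * image_width  # 192.0
area_ratio = height_ratio * width_ratio  # 0.025 of the image area
area_px = area_ratio * (image_height * image_width)  # 51840.0 px^2
print(height_px, width_px, area_ratio, area_px)
```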
hafnia/dataset/dataset_upload_helper.py (continued):

```diff
@@ -155,8 +187,78 @@ class EntityTypeChoices(str, Enum):  # Should match `EntityTypeChoices` in `dipd
     EVENT = "EVENT"
 
 
-class
-
+class Annotations(BaseModel):
+    """
+    Used in 'DatasetImageMetadata' for visualizing image annotations
+    in gallery images on the dataset detail page.
+    """
+
+    objects: Optional[List[Bbox]] = None
+    classifications: Optional[List[Classification]] = None
+    polygons: Optional[List[Polygon]] = None
+    bitmasks: Optional[List[Bitmask]] = None
+
+
+class DatasetImageMetadata(BaseModel):
+    """
+    Metadata for gallery images on the dataset detail page on portal.
+    """
+
+    annotations: Optional[Annotations] = None
+    meta: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def from_sample(cls, sample: Sample) -> "DatasetImageMetadata":
+        sample = sample.model_copy(deep=True)
+        sample.file_path = "/".join(Path(sample.file_path).parts[-3:])
+        metadata = {}
+        metadata_field_names = [
+            ColumnName.FILE_PATH,
+            ColumnName.HEIGHT,
+            ColumnName.WIDTH,
+            ColumnName.SPLIT,
+        ]
+        for field_name in metadata_field_names:
+            if hasattr(sample, field_name) and getattr(sample, field_name) is not None:
+                metadata[field_name] = getattr(sample, field_name)
+
+        obj = DatasetImageMetadata(
+            annotations=Annotations(
+                objects=sample.objects,
+                classifications=sample.classifications,
+                polygons=sample.polygons,
+                bitmasks=sample.bitmasks,
+            ),
+            meta=metadata,
+        )
+
+        return obj
+
+
+class DatasetImage(Attribution, validate_assignment=True):  # type: ignore[call-arg]
+    img: str  # Base64-encoded image string
+    order: Optional[int] = None
+    metadata: Optional[DatasetImageMetadata] = None
+
+    @field_validator("img", mode="before")
+    def validate_image_path(cls, v: Union[str, Path]) -> str:
+        if isinstance(v, Path):
+            v = path_image_to_base64_str(path_image=v)
+
+        if not isinstance(v, str):
+            raise ValueError("Image must be a string or Path object representing the image path.")
+
+        if not v.startswith("data:image/"):
+            raise ValueError("Image must be a base64-encoded data URL.")
+
+        return v
+
+
+def path_image_to_base64_str(path_image: Path) -> str:
+    image = Image.open(path_image)
+    mime_format = Image.MIME[image.format]
+    as_b64 = base64.b64encode(path_image.read_bytes()).decode("ascii")
+    return f"data:{mime_format};base64,{as_b64}"
 
 
 class DbDistributionType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
```
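`DatasetImage.img` accepts either a ready-made data URL or a `Path`, which the `validate_image_path` validator converts via `path_image_to_base64_str`: PIL detects the MIME type and the raw file bytes are base64-encoded. A self-contained sketch of the same conversion (the `example.png` file name is hypothetical; the file is created on the fly so the snippet runs):

```python
import base64
from pathlib import Path

from PIL import Image

# Create a tiny PNG so the example runs standalone (hypothetical file name).
path_image = Path("example.png")
Image.new("RGB", (4, 4), color=(255, 0, 0)).save(path_image)

# Same steps as path_image_to_base64_str: detect MIME type, base64-encode bytes.
image = Image.open(path_image)
mime_format = Image.MIME[image.format]  # "image/png"
as_b64 = base64.b64encode(path_image.read_bytes()).decode("ascii")
data_url = f"data:{mime_format};base64,{as_b64}"
print(data_url[:40], "...")  # data:image/png;base64,iVBORw0KGgo...
```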
hafnia/dataset/dataset_upload_helper.py (continued):

```diff
@@ -185,7 +287,10 @@ def get_folder_size(path: Path) -> int:
     return sum([path.stat().st_size for path in path.rglob("*")])
 
 
-def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset) -> dict:
+def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset, upload_gallery_images: bool) -> dict:
+    if not upload_gallery_images:
+        dataset_update.imgs = None
+
     cfg = Config()
     dataset_details = dataset_update.model_dump_json()
     data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
@@ -199,9 +304,9 @@ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
     import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
     headers = {"Authorization": cfg.api_key}
 
-    user_logger.info("
-
-    return
+    user_logger.info("Exporting dataset details to platform. This may take up to 30 seconds...")
+    response = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+    return response  # type: ignore[return-value]
 
 
 def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
```
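`upload_to_hafnia_dataset_detail_page` now requires an explicit `upload_gallery_images` flag and clears `imgs` before serialization when the flag is `False`, so no base64 images end up in the payload sent to the `/import` endpoint. A hedged call sketch, assuming a populated `DbDataset` instance named `db_dataset`:

```python
# Sketch only: db_dataset is an assumed, fully populated DbDataset.
# With upload_gallery_images=False, dataset_update.imgs is set to None,
# so model_dump_json() produces a payload without gallery images.
response = upload_to_hafnia_dataset_detail_page(
    dataset_update=db_dataset,
    upload_gallery_images=False,
)
```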
hafnia/dataset/dataset_upload_helper.py (continued):

```diff
@@ -219,7 +324,6 @@ def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Ty
     col_name = PrimitiveType.column_name()
     table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
     if col_name not in table.columns:
-        user_logger.warning(f"Warning: No field called '{col_name}' was found for '{PrimitiveType.__name__}'.")
         return False
 
     if table[col_name].dtype == pl.Null:
@@ -235,7 +339,7 @@ def calculate_distribution_values(
 
     if len(distribution_tasks) == 0:
         return []
-    classification_column =
+    classification_column = Classification.column_name()
     classifications = dataset_split.select(pl.col(classification_column).explode())
     classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
     classifications = classifications.filter(
@@ -277,6 +381,8 @@ def dataset_info_from_dataset(
     deployment_stage: DeploymentStage,
     path_sample: Optional[Path],
     path_hidden: Optional[Path],
+    path_gallery_images: Optional[Path] = None,
+    gallery_image_names: Optional[List[str]] = None,
 ) -> DbDataset:
     dataset_variants = []
     dataset_reports = []
@@ -292,6 +398,12 @@
     if len(path_and_variant) == 0:
         raise ValueError("At least one path must be provided for sample or hidden dataset.")
 
+    gallery_images = create_gallery_images(
+        dataset=dataset,
+        path_gallery_images=path_gallery_images,
+        gallery_image_names=gallery_image_names,
+    )
+
     for path_dataset, variant_type in path_and_variant:
         if variant_type == DatasetVariant.SAMPLE:
             dataset_variant = dataset.create_sample_dataset()
@@ -331,19 +443,26 @@
         )
 
         object_reports: List[DbAnnotatedObjectReport] = []
-        primitive_columns = [
-        if has_primitive(dataset_split, PrimitiveType=
-
-
-
-        df_per_instance
-
-
+        primitive_columns = [primitive.column_name() for primitive in primitives.PRIMITIVE_TYPES]
+        if has_primitive(dataset_split, PrimitiveType=Bbox):
+            df_per_instance = table_transformations.create_primitive_table(
+                dataset_split, PrimitiveType=Bbox, keep_sample_data=True
+            )
+            if df_per_instance is None:
+                raise ValueError(f"Expected {Bbox.__name__} primitive column to be present in the dataset split.")
             # Calculate area of bounding boxes
-            df_per_instance = df_per_instance.with_columns(
+            df_per_instance = df_per_instance.with_columns(
+                (pl.col("height") * pl.col("width")).alias("area"),
+            ).with_columns(
+                (pl.col("height") * pl.col("image.height")).alias("height_px"),
+                (pl.col("width") * pl.col("image.width")).alias("width_px"),
+                (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
+            )
 
             annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
-            for (class_name,), class_group in df_per_instance.group_by(
+            for (class_name, task_name), class_group in df_per_instance.group_by(
+                FieldName.CLASS_NAME, FieldName.TASK_NAME
+            ):
                 if class_name is None:
                     continue
                 object_reports.append(
```
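The rewritten `Bbox` branch derives pixel-space columns from normalized geometry with two chained `with_columns` calls: `area` is computed first so the second call can reuse it. A self-contained polars sketch of the same expressions on a toy table (column names match the diff; the data is made up):

```python
import polars as pl

# Toy per-instance table: normalized bbox geometry plus image dimensions.
df = pl.DataFrame(
    {
        "height": [0.5, 0.25],
        "width": [0.5, 0.10],
        "image.height": [100, 200],
        "image.width": [100, 200],
    }
)

# Same expressions as the diff: area first, then pixel-space columns.
df = df.with_columns(
    (pl.col("height") * pl.col("width")).alias("area"),
).with_columns(
    (pl.col("height") * pl.col("image.height")).alias("height_px"),
    (pl.col("width") * pl.col("image.width")).alias("width_px"),
    (pl.col("area") * (pl.col("image.height") * pl.col("image.width"))).alias("area_px"),
)
print(df)  # area_px for row 0: 0.25 * (100 * 100) = 2500.0
```

Computing `area` in a separate `with_columns` call matters in polars: expressions within one call cannot reference columns created by sibling expressions in the same call.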
hafnia/dataset/dataset_upload_helper.py (continued):

```diff
@@ -351,25 +470,39 @@
                         obj=DbAnnotatedObject(
                             name=class_name,
                             entity_type=EntityTypeChoices.OBJECT.value,
+                            annotation_type=annotation_type,
+                            task_name=task_name,
                         ),
                         unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                         obj_instances=len(class_group),
                         annotation_type=[annotation_type],
-
-
-
+                        images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                        area_avg_ratio=class_group["area"].mean(),
+                        area_min_ratio=class_group["area"].min(),
+                        area_max_ratio=class_group["area"].max(),
+                        height_avg_ratio=class_group["height"].mean(),
+                        height_min_ratio=class_group["height"].min(),
+                        height_max_ratio=class_group["height"].max(),
+                        width_avg_ratio=class_group["width"].mean(),
+                        width_min_ratio=class_group["width"].min(),
+                        width_max_ratio=class_group["width"].max(),
+                        area_avg_px=class_group["area_px"].mean(),
+                        area_min_px=int(class_group["area_px"].min()),
+                        area_max_px=int(class_group["area_px"].max()),
+                        height_avg_px=class_group["height_px"].mean(),
+                        height_min_px=int(class_group["height_px"].min()),
+                        height_max_px=int(class_group["height_px"].max()),
+                        width_avg_px=class_group["width_px"].mean(),
+                        width_min_px=int(class_group["width_px"].min()),
+                        width_max_px=int(class_group["width_px"].max()),
                         average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
                     )
                 )
 
-        if has_primitive(dataset_split, PrimitiveType=
+        if has_primitive(dataset_split, PrimitiveType=Classification):
             annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
-            col_name =
-            classification_tasks = [
-                task.name
-                for task in dataset.info.tasks
-                if task.primitive == hafnia.dataset.primitives.classification.Classification
-            ]
+            col_name = Classification.column_name()
+            classification_tasks = [task.name for task in dataset.info.tasks if task.primitive == Classification]
             has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
             if has_classification_data:
                 classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
@@ -385,7 +518,7 @@
                 ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
                     if class_name is None:
                         continue
-                    if task_name ==
+                    if task_name == Classification.default_task_name():
                         display_name = class_name  # Prefix class name with task name
                     else:
                         display_name = f"{task_name}.{class_name}"
@@ -394,6 +527,8 @@
                             obj=DbAnnotatedObject(
                                 name=display_name,
                                 entity_type=EntityTypeChoices.EVENT.value,
+                                annotation_type=annotation_type,
+                                task_name=task_name,
                             ),
                             unique_obj_ids=len(
                                 class_group
@@ -403,22 +538,32 @@
                         )
                     )
 
-        if has_primitive(dataset_split, PrimitiveType=
+        if has_primitive(dataset_split, PrimitiveType=Segmentation):
             raise NotImplementedError("Not Implemented yet")
 
-        if has_primitive(dataset_split, PrimitiveType=
-            col_name =
+        if has_primitive(dataset_split, PrimitiveType=Bitmask):
+            col_name = Bitmask.column_name()
             drop_columns = [col for col in primitive_columns if col != col_name]
             drop_columns.append(FieldName.META)
-            df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
-            df_per_instance = df_per_instance.explode(col_name).drop(drop_columns).unnest(col_name)
 
-
-
-
+            df_per_instance = table_transformations.create_primitive_table(
+                dataset_split, PrimitiveType=Bitmask, keep_sample_data=True
+            )
+            if df_per_instance is None:
+                raise ValueError(
+                    f"Expected {Bitmask.__name__} primitive column to be present in the dataset split."
+                )
+            df_per_instance = df_per_instance.rename({"height": "height_px", "width": "width_px"})
+            df_per_instance = df_per_instance.with_columns(
+                (pl.col("image.height") * pl.col("image.width") * pl.col("area")).alias("area_px"),
+                (pl.col("height_px") / pl.col("image.height")).alias("height"),
+                (pl.col("width_px") / pl.col("image.width")).alias("width"),
+            )
 
             annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation)
-            for (class_name,), class_group in df_per_instance.group_by(
+            for (class_name, task_name), class_group in df_per_instance.group_by(
+                FieldName.CLASS_NAME, FieldName.TASK_NAME
+            ):
                 if class_name is None:
                     continue
                 object_reports.append(
@@ -426,18 +571,36 @@
                         obj=DbAnnotatedObject(
                             name=class_name,
                             entity_type=EntityTypeChoices.OBJECT.value,
+                            annotation_type=annotation_type,
+                            task_name=task_name,
                         ),
                         unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
                         obj_instances=len(class_group),
                         annotation_type=[annotation_type],
                         average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
-
-
-
+                        images_with_obj=class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                        area_avg_ratio=class_group["area"].mean(),
+                        area_min_ratio=class_group["area"].min(),
+                        area_max_ratio=class_group["area"].max(),
+                        height_avg_ratio=class_group["height"].mean(),
+                        height_min_ratio=class_group["height"].min(),
+                        height_max_ratio=class_group["height"].max(),
+                        width_avg_ratio=class_group["width"].mean(),
+                        width_min_ratio=class_group["width"].min(),
+                        width_max_ratio=class_group["width"].max(),
+                        area_avg_px=class_group["area_px"].mean(),
+                        area_min_px=int(class_group["area_px"].min()),
+                        area_max_px=int(class_group["area_px"].max()),
+                        height_avg_px=class_group["height_px"].mean(),
+                        height_min_px=int(class_group["height_px"].min()),
+                        height_max_px=int(class_group["height_px"].max()),
+                        width_avg_px=class_group["width_px"].mean(),
+                        width_min_px=int(class_group["width_px"].min()),
+                        width_max_px=int(class_group["width_px"].max()),
                     )
                 )
 
-        if has_primitive(dataset_split, PrimitiveType=
+        if has_primitive(dataset_split, PrimitiveType=Polygon):
             raise NotImplementedError("Not Implemented yet")
 
         # Sort object reports by name to more easily compare between versions
@@ -456,13 +619,59 @@
         s3_bucket_name=bucket_sample,
         dataset_variants=dataset_variants,
         split_annotations_reports=dataset_reports,
-
+        latest_update=dataset.info.updated_at,
+        dataset_format_version=dataset.info.format_version,
+        license_citation=dataset.info.reference_bibtex,
         data_captured_start=dataset_meta_info.get("data_captured_start", None),
         data_captured_end=dataset_meta_info.get("data_captured_end", None),
         data_received_start=dataset_meta_info.get("data_received_start", None),
         data_received_end=dataset_meta_info.get("data_received_end", None),
         annotation_project_id=dataset_meta_info.get("annotation_project_id", None),
         annotation_dataset_id=dataset_meta_info.get("annotation_dataset_id", None),
+        imgs=gallery_images,
     )
 
     return dataset_info
+
+
+def create_gallery_images(
+    dataset: HafniaDataset,
+    path_gallery_images: Optional[Path],
+    gallery_image_names: Optional[List[str]],
+) -> Optional[List[DatasetImage]]:
+    gallery_images = None
+    if (gallery_image_names is not None) and (len(gallery_image_names) > 0):
+        if path_gallery_images is None:
+            raise ValueError("Path to gallery images must be provided.")
+        path_gallery_images.mkdir(parents=True, exist_ok=True)
+        COL_IMAGE_NAME = "image_name"
+        samples = dataset.samples.with_columns(
+            dataset.samples[ColumnName.FILE_PATH].str.split("/").list.last().alias(COL_IMAGE_NAME)
+        )
+        gallery_samples = samples.filter(pl.col(COL_IMAGE_NAME).is_in(gallery_image_names))
+
+        missing_gallery_samples = set(gallery_image_names) - set(gallery_samples[COL_IMAGE_NAME])
+        if len(missing_gallery_samples):
+            raise ValueError(f"Gallery images not found in dataset: {missing_gallery_samples}")
+        gallery_images = []
+        for gallery_sample in gallery_samples.iter_rows(named=True):
+            sample = Sample(**gallery_sample)
+
+            metadata = DatasetImageMetadata.from_sample(sample=sample)
+            sample.classifications = None  # To not draw classifications in gallery images
+            image = sample.draw_annotations()
+
+            path_gallery_image = path_gallery_images / gallery_sample[COL_IMAGE_NAME]
+            Image.fromarray(image).save(path_gallery_image)
+
+            dataset_image_dict = {
+                "img": path_gallery_image,
+                "metadata": metadata,
+            }
+            if sample.attribution is not None:
+                sample.attribution.changes = "Annotations have been visualized"
+                dataset_image_dict.update(sample.attribution.model_dump(exclude_none=True))
+            gallery_img = DatasetImage(**dataset_image_dict)
+            gallery_img.licenses = gallery_img.licenses or []
+            gallery_images.append(gallery_img)
+    return gallery_images
```
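`create_gallery_images` matches the requested file names against the sample table, renders annotations with `Sample.draw_annotations`, saves each preview under `path_gallery_images`, and wraps the saved path in a `DatasetImage`, whose validator turns the `Path` into a data URL and which carries over any attribution. A hedged call sketch; the dataset, directory, and file name are assumptions:

```python
from pathlib import Path

# Sketch only: `dataset` is an assumed HafniaDataset, and "000123.jpg" is
# assumed to be a file name present in its sample table; any requested name
# missing from the dataset raises ValueError.
gallery_images = create_gallery_images(
    dataset=dataset,
    path_gallery_images=Path("./gallery_images"),  # rendered previews are written here
    gallery_image_names=["000123.jpg"],
)
```

The same two arguments are plumbed through `dataset_info_from_dataset`, which attaches the result as `imgs` on the returned `DbDataset`.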
hafnia/dataset/format_conversions/image_classification_from_directory.py (new file):

```diff
@@ -0,0 +1,106 @@
+import shutil
+from pathlib import Path
+from typing import List, Optional
+
+import more_itertools
+import polars as pl
+from PIL import Image
+from rich.progress import track
+
+from hafnia.dataset.dataset_names import ColumnName, FieldName
+from hafnia.dataset.hafnia_dataset import DatasetInfo, HafniaDataset, Sample, TaskInfo
+from hafnia.dataset.primitives import Classification
+from hafnia.utils import is_image_file
+
+
+def import_image_classification_directory_tree(
+    path_folder: Path,
+    split: str,
+    n_samples: Optional[int] = None,
+) -> HafniaDataset:
+    class_folder_paths = [path for path in path_folder.iterdir() if path.is_dir()]
+    class_names = sorted([folder.name for folder in class_folder_paths])  # Sort for determinism
+
+    # Gather all image paths per class
+    path_images_per_class: List[List[Path]] = []
+    for path_class_folder in class_folder_paths:
+        per_class_images = []
+        for path_image in list(path_class_folder.rglob("*.*")):
+            if is_image_file(path_image):
+                per_class_images.append(path_image)
+        path_images_per_class.append(sorted(per_class_images))
+
+    # Interleave to ensure classes are balanced in the output dataset for n_samples < total
+    path_images = list(more_itertools.interleave_longest(*path_images_per_class))
+
+    if n_samples is not None:
+        path_images = path_images[:n_samples]
+
+    samples = []
+    for path_image_org in track(path_images, description="Convert 'image classification' dataset to Hafnia Dataset"):
+        class_name = path_image_org.parent.name
+
+        read_image = Image.open(path_image_org)
+        width, height = read_image.size
+
+        classifications = [Classification(class_name=class_name, class_idx=class_names.index(class_name))]
+        sample = Sample(
+            file_path=str(path_image_org.absolute()),
+            width=width,
+            height=height,
+            split=split,
+            classifications=classifications,
+        )
+        samples.append(sample)
+
+    dataset_info = DatasetInfo(
+        dataset_name="ImageClassificationFromDirectoryTree",
+        tasks=[TaskInfo(primitive=Classification, class_names=class_names)],
+    )
+
+    hafnia_dataset = HafniaDataset.from_samples_list(samples_list=samples, info=dataset_info)
+    return hafnia_dataset
+
+
+def export_image_classification_directory_tree(
+    dataset: HafniaDataset,
+    path_output: Path,
+    task_name: Optional[str] = None,
+    clean_folder: bool = False,
+) -> Path:
+    task = dataset.info.get_task_by_task_name_and_primitive(task_name=task_name, primitive=Classification)
+
+    samples = dataset.samples.with_columns(
+        pl.col(task.primitive.column_name())
+        .list.filter(pl.element().struct.field(FieldName.TASK_NAME) == task.name)
+        .alias(task.primitive.column_name())
+    )
+
+    classification_counts = samples[task.primitive.column_name()].list.len()
+    has_no_classification_samples = (classification_counts == 0).sum()
+    if has_no_classification_samples > 0:
+        raise ValueError(f"Some samples do not have a classification for task '{task.name}'.")
+
+    has_multi_classification_samples = (classification_counts > 1).sum()
+    if has_multi_classification_samples > 0:
+        raise ValueError(f"Some samples have multiple classifications for task '{task.name}'.")
+
+    if clean_folder:
+        shutil.rmtree(path_output, ignore_errors=True)
+    path_output.mkdir(parents=True, exist_ok=True)
+
+    description = "Export Hafnia Dataset to directory tree"
+    for sample_dict in track(samples.iter_rows(named=True), total=len(samples), description=description):
+        classifications = sample_dict[task.primitive.column_name()]
+        if len(classifications) != 1:
+            raise ValueError("Each sample should have exactly one classification.")
+        classification = classifications[0]
+        class_name = classification[FieldName.CLASS_NAME].replace("/", "_")  # Avoid issues with subfolders
+        path_class_folder = path_output / class_name
+        path_class_folder.mkdir(parents=True, exist_ok=True)
+
+        path_image_org = Path(sample_dict[ColumnName.FILE_PATH])
+        path_image_new = path_class_folder / path_image_org.name
+        shutil.copy2(path_image_org, path_image_new)
+
+    return path_output
```
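The new module converts between `HafniaDataset` and the common one-folder-per-class image layout. A round-trip sketch; the paths are assumptions, and `./flowers/train` is expected to contain one subfolder per class with image files inside:

```python
from pathlib import Path

# Sketch only: paths are assumptions. ./flowers/train is expected to hold
# one subfolder per class (e.g. daisy/, rose/) containing image files.
dataset = import_image_classification_directory_tree(
    path_folder=Path("./flowers/train"),
    split="train",
    n_samples=100,  # classes are interleaved, so the subset stays balanced
)

# Write the dataset back out as a directory tree, one folder per class.
path_out = export_image_classification_directory_tree(
    dataset=dataset,
    path_output=Path("./flowers_export"),
    clean_folder=True,  # remove any previous export first
)
print(path_out)
```

Interleaving the per-class image lists via `more_itertools.interleave_longest` means a truncated `n_samples` subset still draws from every class rather than exhausting the first folder alphabetically.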