hafnia 0.1.27__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cli/__main__.py +2 -2
  2. cli/config.py +17 -4
  3. cli/dataset_cmds.py +60 -0
  4. cli/runc_cmds.py +1 -1
  5. hafnia/data/__init__.py +2 -2
  6. hafnia/data/factory.py +12 -56
  7. hafnia/dataset/dataset_helpers.py +91 -0
  8. hafnia/dataset/dataset_names.py +72 -0
  9. hafnia/dataset/dataset_recipe/dataset_recipe.py +327 -0
  10. hafnia/dataset/dataset_recipe/recipe_transforms.py +53 -0
  11. hafnia/dataset/dataset_recipe/recipe_types.py +140 -0
  12. hafnia/dataset/dataset_upload_helper.py +468 -0
  13. hafnia/dataset/hafnia_dataset.py +624 -0
  14. hafnia/dataset/operations/dataset_stats.py +15 -0
  15. hafnia/dataset/operations/dataset_transformations.py +82 -0
  16. hafnia/dataset/operations/table_transformations.py +183 -0
  17. hafnia/dataset/primitives/__init__.py +16 -0
  18. hafnia/dataset/primitives/bbox.py +137 -0
  19. hafnia/dataset/primitives/bitmask.py +182 -0
  20. hafnia/dataset/primitives/classification.py +56 -0
  21. hafnia/dataset/primitives/point.py +25 -0
  22. hafnia/dataset/primitives/polygon.py +100 -0
  23. hafnia/dataset/primitives/primitive.py +44 -0
  24. hafnia/dataset/primitives/segmentation.py +51 -0
  25. hafnia/dataset/primitives/utils.py +51 -0
  26. hafnia/experiment/hafnia_logger.py +7 -7
  27. hafnia/helper_testing.py +108 -0
  28. hafnia/http.py +5 -3
  29. hafnia/platform/__init__.py +2 -2
  30. hafnia/platform/datasets.py +197 -0
  31. hafnia/platform/download.py +85 -23
  32. hafnia/torch_helpers.py +180 -95
  33. hafnia/utils.py +21 -2
  34. hafnia/visualizations/colors.py +267 -0
  35. hafnia/visualizations/image_visualizations.py +202 -0
  36. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/METADATA +209 -99
  37. hafnia-0.2.1.dist-info/RECORD +50 -0
  38. cli/data_cmds.py +0 -53
  39. hafnia-0.1.27.dist-info/RECORD +0 -27
  40. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/WHEEL +0 -0
  41. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/entry_points.txt +0 -0
  42. {hafnia-0.1.27.dist-info → hafnia-0.2.1.dist-info}/licenses/LICENSE +0 -0
hafnia/dataset/dataset_upload_helper.py (new file)
@@ -0,0 +1,468 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple, Type, Union
+
+ import boto3
+ import polars as pl
+ from pydantic import BaseModel, ConfigDict
+
+ import hafnia.dataset.primitives.bbox
+ import hafnia.dataset.primitives.bitmask
+ import hafnia.dataset.primitives.classification
+ import hafnia.dataset.primitives.polygon
+ import hafnia.dataset.primitives.segmentation
+ from cli.config import Config
+ from hafnia.dataset import primitives
+ from hafnia.dataset.dataset_names import (
+     ColumnName,
+     DatasetVariant,
+     DeploymentStage,
+     FieldName,
+     SplitName,
+ )
+ from hafnia.dataset.hafnia_dataset import HafniaDataset, TaskInfo
+ from hafnia.dataset.primitives.primitive import Primitive
+ from hafnia.http import post
+ from hafnia.log import user_logger
+ from hafnia.platform import get_dataset_id
+
+
+ def generate_bucket_name(dataset_name: str, deployment_stage: DeploymentStage) -> str:
+     # TODO: When moving to versioning we do NOT need 'staging' and 'production' specific buckets
+     # and the new name convention should be: f"hafnia-dataset-{dataset_name}"
+     return f"mdi-{deployment_stage.value}-{dataset_name}"
+
+
+ class DbDataset(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
+     name: str
+     data_captured_start: Optional[datetime] = None
+     data_captured_end: Optional[datetime] = None
+     data_received_start: Optional[datetime] = None
+     data_received_end: Optional[datetime] = None
+     latest_update: Optional[datetime] = None
+     license_citation: Optional[str] = None
+     version: Optional[str] = None
+     s3_bucket_name: Optional[str] = None
+     annotation_date: Optional[datetime] = None
+     annotation_project_id: Optional[str] = None
+     annotation_dataset_id: Optional[str] = None
+     annotation_ontology: Optional[str] = None
+     dataset_variants: Optional[List[DbDatasetVariant]] = None
+     split_annotations_reports: Optional[List[DbSplitAnnotationsReport]] = None
+     dataset_images: Optional[List[DatasetImage]] = None
+
+
+ class DbDatasetVariant(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
+     variant_type: VariantTypeChoices  # Required
+     upload_date: Optional[datetime] = None
+     size_bytes: Optional[int] = None
+     data_type: Optional[str] = None
+     number_of_data_items: Optional[int] = None
+     resolutions: Optional[List[DbResolution]] = None
+     duration: Optional[float] = None
+     duration_average: Optional[float] = None
+     frame_rate: Optional[float] = None
+     bit_rate: Optional[float] = None
+     n_cameras: Optional[int] = None
+
+
+ class DbAnnotatedObject(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
+     name: str
+     entity_type: EntityTypeChoices
+
+
+ class DbAnnotatedObjectReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
+     obj: DbAnnotatedObject
+     unique_obj_ids: Optional[int] = None
+     obj_instances: Optional[int] = None
+     average_count_per_image: Optional[float] = None
+     avg_area: Optional[float] = None
+     min_area: Optional[float] = None
+     max_area: Optional[float] = None
+     annotation_type: Optional[List[DbAnnotationType]] = None
+
+
+ class DbDistributionValue(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     distribution_category: DbDistributionCategory
+     percentage: Optional[float] = None
+
+     @staticmethod
+     def from_names(type_name: str, category_name: str, percentage: Optional[float]) -> "DbDistributionValue":
+         dist_type = DbDistributionType(name=type_name)
+         dist_category = DbDistributionCategory(distribution_type=dist_type, name=category_name)
+         return DbDistributionValue(distribution_category=dist_category, percentage=percentage)
+
+
+ class DbSplitAnnotationsReport(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     model_config = ConfigDict(use_enum_values=True)  # To parse Enum values as strings
+     variant_type: VariantTypeChoices  # Required
+     split: str  # Required
+     sample_count: Optional[int] = None
+     annotated_object_reports: Optional[List[DbAnnotatedObjectReport]] = None
+     distribution_values: Optional[List[DbDistributionValue]] = None
+
+
+ class DbDistributionCategory(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     distribution_type: DbDistributionType
+     name: str
+
+
+ class DbAnnotationType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     name: str
+
+
+ class AnnotationType(Enum):
+     ImageClassification = "Image Classification"
+     ObjectDetection = "Object Detection"
+     SegmentationMask = "Segmentation Mask"
+     ImageCaptioning = "Image Captioning"
+     InstanceSegmentation = "Instance Segmentation"
+
+
+ class DbResolution(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     height: int
+     width: int
+
+
+ class DataTypeChoices(str, Enum):  # Should match `DataTypeChoices` in `dipdatalib::src/apps/datasets/models.py`
+     images = "images"
+     video_frames = "video_frames"
+     video_clips = "video_clips"
+
+
+ class VariantTypeChoices(str, Enum):  # Should match `VariantType` in `dipdatalib::src/apps/datasets/models.py`
+     ORIGINAL = "original"
+     HIDDEN = "hidden"
+     SAMPLE = "sample"
+
+
+ class SplitChoices(str, Enum):  # Should match `SplitChoices` in `dipdatalib::src/apps/datasets/models.py`
+     FULL = "full"
+     TRAIN = "train"
+     TEST = "test"
+     VALIDATION = "validation"
+
+
+ class EntityTypeChoices(str, Enum):  # Should match `EntityTypeChoices` in `dipdatalib::src/apps/datasets/models.py`
+     OBJECT = "OBJECT"
+     EVENT = "EVENT"
+
+
+ class DatasetImage(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     img: str
+
+
+ class DbDistributionType(BaseModel, validate_assignment=True):  # type: ignore[call-arg]
+     name: str
+
+
+ VARIANT_TYPE_MAPPING: Dict[
+     DatasetVariant, VariantTypeChoices
+ ] = {  # Consider making DatasetVariant & VariantTypeChoices into one
+     DatasetVariant.DUMP: VariantTypeChoices.ORIGINAL,
+     DatasetVariant.HIDDEN: VariantTypeChoices.HIDDEN,
+     DatasetVariant.SAMPLE: VariantTypeChoices.SAMPLE,
+ }
+
+ SPLIT_CHOICE_MAPPING: Dict[SplitChoices, List[str]] = {
+     SplitChoices.FULL: SplitName.valid_splits(),
+     SplitChoices.TRAIN: [SplitName.TRAIN],
+     SplitChoices.TEST: [SplitName.TEST],
+     SplitChoices.VALIDATION: [SplitName.VAL],
+ }
+
+
+ def get_folder_size(path: Path) -> int:
+     if not path.exists():
+         raise FileNotFoundError(f"The path {path} does not exist.")
+     return sum([path.stat().st_size for path in path.rglob("*")])
+
+
+ def upload_to_hafnia_dataset_detail_page(dataset_update: DbDataset) -> dict:
+     cfg = Config()
+     dataset_details = dataset_update.model_dump_json()
+     data = upload_dataset_details(cfg=cfg, data=dataset_details, dataset_name=dataset_update.name)
+     return data
+
+
+ def upload_dataset_details(cfg: Config, data: str, dataset_name: str) -> dict:
+     dataset_endpoint = cfg.get_platform_endpoint("datasets")
+     dataset_id = get_dataset_id(dataset_name, dataset_endpoint, cfg.api_key)
+
+     import_endpoint = f"{dataset_endpoint}/{dataset_id}/import"
+     headers = {"Authorization": cfg.api_key}
+
+     user_logger.info("Importing dataset details. This may take up to 30 seconds...")
+     data = post(endpoint=import_endpoint, headers=headers, data=data)  # type: ignore[assignment]
+     return data  # type: ignore[return-value]
+
+
+ def get_resolutions(dataset: HafniaDataset, max_resolutions_selected: int = 8) -> List[DbResolution]:
+     unique_resolutions = (
+         dataset.samples.select([pl.col("height"), pl.col("width")]).unique().sort(by=["height", "width"])
+     )
+     if len(unique_resolutions) > max_resolutions_selected:
+         skip_size = len(unique_resolutions) // max_resolutions_selected
+         unique_resolutions = unique_resolutions.gather_every(skip_size)
+     resolutions = [DbResolution(height=res["height"], width=res["width"]) for res in unique_resolutions.to_dicts()]
+     return resolutions
+
+
+ def has_primitive(dataset: Union[HafniaDataset, pl.DataFrame], PrimitiveType: Type[Primitive]) -> bool:
+     col_name = PrimitiveType.column_name()
+     table = dataset.samples if isinstance(dataset, HafniaDataset) else dataset
+     if col_name not in table.columns:
+         user_logger.warning(f"Warning: No field called '{col_name}' was found for '{PrimitiveType.__name__}'.")
+         return False
+
+     if table[col_name].dtype == pl.Null:
+         return False
+
+     return True
+
+
+ def calculate_distribution_values(
+     dataset_split: pl.DataFrame, distribution_tasks: Optional[List[TaskInfo]]
+ ) -> List[DbDistributionValue]:
+     distribution_tasks = distribution_tasks or []
+
+     if len(distribution_tasks) == 0:
+         return []
+     classification_column = hafnia.dataset.primitives.classification.Classification.column_name()
+     classifications = dataset_split.select(pl.col(classification_column).explode())
+     classifications = classifications.filter(pl.col(classification_column).is_not_null()).unnest(classification_column)
+     classifications = classifications.filter(
+         pl.col(FieldName.TASK_NAME).is_in([task.name for task in distribution_tasks])
+     )
+     dist_values = []
+     for (task_name,), task_group in classifications.group_by(FieldName.TASK_NAME):
+         distribution_type = DbDistributionType(name=task_name)
+         n_annotated_total = len(task_group)
+         for (class_name,), class_group in task_group.group_by(FieldName.CLASS_NAME):
+             class_count = len(class_group)
+
+             dist_values.append(
+                 DbDistributionValue(
+                     distribution_category=DbDistributionCategory(distribution_type=distribution_type, name=class_name),
+                     percentage=class_count / n_annotated_total * 100,
+                 )
+             )
+     dist_values = sorted(
+         dist_values,
+         key=lambda x: (
+             x.distribution_category.distribution_type.name,
+             x.distribution_category.name,
+         ),
+     )
+     return dist_values
+
+
+ def s3_based_fields(bucket_name: str, variant_type: DatasetVariant, session: boto3.Session) -> tuple[datetime, int]:
+     client = session.client("s3")
+     file_objects = client.list_objects_v2(Bucket=bucket_name, Prefix=variant_type.value)["Contents"]
+     last_modified = sorted([file_obj["LastModified"] for file_obj in file_objects])[-1]
+     size = sum([file_obj["Size"] for file_obj in file_objects])
+     return last_modified, size
+
+
+ def dataset_info_from_dataset(
+     dataset: HafniaDataset,
+     deployment_stage: DeploymentStage,
+     path_sample: Optional[Path],
+     path_hidden: Optional[Path],
+ ) -> DbDataset:
+     dataset_variants = []
+     dataset_reports = []
+     dataset_meta_info = dataset.info.meta or {}
+
+     path_and_variant: List[Tuple[Path, DatasetVariant]] = []
+     if path_sample is not None:
+         path_and_variant.append((path_sample, DatasetVariant.SAMPLE))
+
+     if path_hidden is not None:
+         path_and_variant.append((path_hidden, DatasetVariant.HIDDEN))
+
+     if len(path_and_variant) == 0:
+         raise ValueError("At least one path must be provided for sample or hidden dataset.")
+
+     for path_dataset, variant_type in path_and_variant:
+         if variant_type == DatasetVariant.SAMPLE:
+             dataset_variant = dataset.create_sample_dataset()
+         else:
+             dataset_variant = dataset
+
+         size_bytes = get_folder_size(path_dataset)
+         dataset_variants.append(
+             DbDatasetVariant(
+                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
+                 # upload_date: Optional[datetime] = None
+                 size_bytes=size_bytes,
+                 data_type=DataTypeChoices.images,
+                 number_of_data_items=len(dataset_variant),
+                 resolutions=get_resolutions(dataset_variant, max_resolutions_selected=8),
+                 duration=dataset_meta_info.get("duration", None),
+                 duration_average=dataset_meta_info.get("duration_average", None),
+                 frame_rate=dataset_meta_info.get("frame_rate", None),
+                 # bit_rate: Optional[float] = None
+                 n_cameras=dataset_meta_info.get("n_cameras", None),
+             )
+         )
+
+         for split_name in SplitChoices:
+             split_names = SPLIT_CHOICE_MAPPING[split_name]
+             dataset_split = dataset_variant.samples.filter(pl.col(ColumnName.SPLIT).is_in(split_names))
+
+             distribution_values = calculate_distribution_values(
+                 dataset_split=dataset_split,
+                 distribution_tasks=dataset.info.distributions,
+             )
+             report = DbSplitAnnotationsReport(
+                 variant_type=VARIANT_TYPE_MAPPING[variant_type],  # type: ignore[index]
+                 split=split_name,
+                 sample_count=len(dataset_split),
+                 distribution_values=distribution_values,
+             )
+
+             object_reports: List[DbAnnotatedObjectReport] = []
+             primitive_columns = [tPrimtive.column_name() for tPrimtive in primitives.PRIMITIVE_TYPES]
+             if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bbox.Bbox):
+                 bbox_column_name = hafnia.dataset.primitives.bbox.Bbox.column_name()
+                 drop_columns = [col for col in primitive_columns if col != bbox_column_name]
+                 drop_columns.append(FieldName.META)
+                 df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
+                 df_per_instance = df_per_instance.explode(bbox_column_name).drop(drop_columns).unnest(bbox_column_name)
+
+                 # Calculate area of bounding boxes
+                 df_per_instance = df_per_instance.with_columns((pl.col("height") * pl.col("width")).alias("area"))
+
+                 annotation_type = DbAnnotationType(name=AnnotationType.ObjectDetection.value)
+                 for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                     if class_name is None:
+                         continue
+                     object_reports.append(
+                         DbAnnotatedObjectReport(
+                             obj=DbAnnotatedObject(
+                                 name=class_name,
+                                 entity_type=EntityTypeChoices.OBJECT.value,
+                             ),
+                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
+                             obj_instances=len(class_group),
+                             annotation_type=[annotation_type],
+                             avg_area=class_group["area"].mean(),
+                             min_area=class_group["area"].min(),
+                             max_area=class_group["area"].max(),
+                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                         )
+                     )
+
+             if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.classification.Classification):
+                 annotation_type = DbAnnotationType(name=AnnotationType.ImageClassification.value)
+                 col_name = hafnia.dataset.primitives.classification.Classification.column_name()
+                 classification_tasks = [
+                     task.name
+                     for task in dataset.info.tasks
+                     if task.primitive == hafnia.dataset.primitives.classification.Classification
+                 ]
+                 has_classification_data = dataset_split[col_name].dtype != pl.List(pl.Null)
+                 if has_classification_data:
+                     classification_df = dataset_split.select(col_name).explode(col_name).unnest(col_name)
+
+                     # Include only classification tasks that are defined in the dataset info
+                     classification_df = classification_df.filter(
+                         pl.col(FieldName.TASK_NAME).is_in(classification_tasks)
+                     )
+
+                     for (
+                         task_name,
+                         class_name,
+                     ), class_group in classification_df.group_by(FieldName.TASK_NAME, FieldName.CLASS_NAME):
+                         if class_name is None:
+                             continue
+                         if task_name == hafnia.dataset.primitives.classification.Classification.default_task_name():
+                             display_name = class_name  # Default task: use the class name without a task-name prefix
+                         else:
+                             display_name = f"{task_name}.{class_name}"
+                         object_reports.append(
+                             DbAnnotatedObjectReport(
+                                 obj=DbAnnotatedObject(
+                                     name=display_name,
+                                     entity_type=EntityTypeChoices.EVENT.value,
+                                 ),
+                                 unique_obj_ids=len(
+                                     class_group
+                                 ),  # Unique object IDs are not applicable for classification
+                                 obj_instances=len(class_group),
+                                 annotation_type=[annotation_type],
+                             )
+                         )
+
+             if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.segmentation.Segmentation):
+                 raise NotImplementedError("Not Implemented yet")
+
+             if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.bitmask.Bitmask):
+                 col_name = hafnia.dataset.primitives.bitmask.Bitmask.column_name()
+                 drop_columns = [col for col in primitive_columns if col != col_name]
+                 drop_columns.append(FieldName.META)
+                 df_per_instance = dataset_split.rename({"height": "image.height", "width": "image.width"})
+                 df_per_instance = df_per_instance.explode(col_name).drop(drop_columns).unnest(col_name)
+
+                 min_area = df_per_instance["area"].min() if "area" in df_per_instance.columns else None
+                 max_area = df_per_instance["area"].max() if "area" in df_per_instance.columns else None
+                 avg_area = df_per_instance["area"].mean() if "area" in df_per_instance.columns else None
+
+                 annotation_type = DbAnnotationType(name=AnnotationType.InstanceSegmentation.value)
+                 for (class_name,), class_group in df_per_instance.group_by(FieldName.CLASS_NAME):
+                     if class_name is None:
+                         continue
+                     object_reports.append(
+                         DbAnnotatedObjectReport(
+                             obj=DbAnnotatedObject(
+                                 name=class_name,
+                                 entity_type=EntityTypeChoices.OBJECT.value,
+                             ),
+                             unique_obj_ids=class_group[FieldName.OBJECT_ID].n_unique(),
+                             obj_instances=len(class_group),
+                             annotation_type=[annotation_type],
+                             average_count_per_image=len(class_group) / class_group[ColumnName.SAMPLE_INDEX].n_unique(),
+                             avg_area=avg_area,
+                             min_area=min_area,
+                             max_area=max_area,
+                         )
+                     )
+
+             if has_primitive(dataset_split, PrimitiveType=hafnia.dataset.primitives.polygon.Polygon):
+                 raise NotImplementedError("Not Implemented yet")
+
+             # Sort object reports by name to more easily compare between versions
+             object_reports = sorted(object_reports, key=lambda x: x.obj.name)
+             report.annotated_object_reports = object_reports
+
+             if report.distribution_values is None:
+                 report.distribution_values = []
+
+             dataset_reports.append(report)
+     dataset_name = dataset.info.dataset_name
+     bucket_sample = generate_bucket_name(dataset_name, deployment_stage=deployment_stage)
+     dataset_info = DbDataset(
+         name=dataset_name,
+         version=dataset.info.version,
+         s3_bucket_name=bucket_sample,
+         dataset_variants=dataset_variants,
+         split_annotations_reports=dataset_reports,
+         license_citation=dataset_meta_info.get("license_citation", None),
+         data_captured_start=dataset_meta_info.get("data_captured_start", None),
+         data_captured_end=dataset_meta_info.get("data_captured_end", None),
+         data_received_start=dataset_meta_info.get("data_received_start", None),
+         data_received_end=dataset_meta_info.get("data_received_end", None),
+         annotation_project_id=dataset_meta_info.get("annotation_project_id", None),
+         annotation_dataset_id=dataset_meta_info.get("annotation_dataset_id", None),
+     )
+
+     return dataset_info
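
For orientation, below is a minimal sketch of how the upload helper added above might be driven end to end. It is an assumption-laden example rather than documentation: the diff does not show how a HafniaDataset is constructed, the DeploymentStage.STAGING member name is inferred from the bucket-naming TODO in generate_bucket_name, and the local export paths are hypothetical.

from pathlib import Path

from hafnia.dataset.dataset_names import DeploymentStage
from hafnia.dataset.dataset_upload_helper import (
    dataset_info_from_dataset,
    upload_to_hafnia_dataset_detail_page,
)
from hafnia.dataset.hafnia_dataset import HafniaDataset

# Assumption: a HafniaDataset instance is obtained elsewhere; this diff does not
# show its constructors, so a placeholder is used here.
dataset: HafniaDataset = ...

# Build the platform-facing metadata (DbDataset) from the dataset and the
# sample/hidden variants that were exported to disk.
db_dataset = dataset_info_from_dataset(
    dataset=dataset,
    deployment_stage=DeploymentStage.STAGING,  # assumed member name
    path_sample=Path("exports/sample"),  # hypothetical local export paths
    path_hidden=Path("exports/hidden"),
)

# Serialize the report and POST it to the dataset's import endpoint on the platform.
response = upload_to_hafnia_dataset_detail_page(db_dataset)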