acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of acryl-datahub has been flagged as potentially problematic.
Files changed (65)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/ingestion/api/source.py +6 -2
  9. datahub/ingestion/api/source_helpers.py +6 -2
  10. datahub/ingestion/extractor/schema_util.py +1 -0
  11. datahub/ingestion/source/common/data_platforms.py +23 -0
  12. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  13. datahub/ingestion/source/common/subtypes.py +15 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  15. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  16. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  17. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  18. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  19. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  20. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  21. datahub/ingestion/source/hex/__init__.py +0 -0
  22. datahub/ingestion/source/hex/api.py +394 -0
  23. datahub/ingestion/source/hex/constants.py +3 -0
  24. datahub/ingestion/source/hex/hex.py +167 -0
  25. datahub/ingestion/source/hex/mapper.py +372 -0
  26. datahub/ingestion/source/hex/model.py +68 -0
  27. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  28. datahub/ingestion/source/mlflow.py +198 -7
  29. datahub/ingestion/source/mode.py +11 -1
  30. datahub/ingestion/source/openapi.py +69 -34
  31. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  32. datahub/ingestion/source/s3/source.py +11 -0
  33. datahub/ingestion/source/slack/slack.py +399 -82
  34. datahub/ingestion/source/superset.py +138 -22
  35. datahub/ingestion/source/vertexai/__init__.py +0 -0
  36. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  37. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  38. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  39. datahub/metadata/_schema_classes.py +472 -1
  40. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  42. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  43. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  44. datahub/metadata/schema.avsc +311 -2
  45. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  46. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  47. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  48. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  49. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  50. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  51. datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
  52. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  53. datahub/metadata/schemas/Siblings.avsc +2 -0
  54. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  55. datahub/sdk/dataset.py +122 -0
  56. datahub/sdk/entity.py +99 -3
  57. datahub/sdk/entity_client.py +27 -3
  58. datahub/sdk/main_client.py +22 -0
  59. datahub/sdk/search_filters.py +4 -4
  60. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  62. datahub/testing/mcp_diff.py +1 -18
  63. datahub/ingestion/source/vertexai.py +0 -697
  64. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
  65. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/hex/mapper.py
@@ -0,0 +1,372 @@
+import logging
+from datetime import datetime
+from typing import Iterable, List, Optional, Tuple
+
+from datahub._codegen.aspect import (
+    _Aspect,  # TODO: is there a better import than this one?
+)
+from datahub.emitter.mce_builder import (
+    make_container_urn,
+    make_dashboard_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_tag_urn,
+    make_ts_millis,
+    make_user_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import ContainerKey
+from datahub.ingestion.api.incremental_lineage_helper import (
+    convert_dashboard_info_to_patch,
+)
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
+from datahub.ingestion.source.hex.constants import (
+    HEX_API_BASE_URL_DEFAULT,
+    HEX_PLATFORM_NAME,
+)
+from datahub.ingestion.source.hex.model import (
+    Analytics,
+    Category,
+    Collection,
+    Component,
+    Owner,
+    Project,
+    Status,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import (
+    AuditStampClass,
+    ChangeAuditStampsClass,
+    OwnershipType,
+)
+from datahub.metadata.schema_classes import (
+    CalendarIntervalClass,
+    ContainerClass,
+    ContainerPropertiesClass,
+    DashboardInfoClass,
+    DashboardUsageStatisticsClass,
+    DataPlatformInstanceClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    SubTypesClass,
+    TagAssociationClass,
+    TimeWindowSizeClass,
+)
+from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
+
+logger = logging.getLogger(__name__)
+
+
+class WorkspaceKey(ContainerKey):
+    workspace_name: str
+
+
+DEFAULT_INGESTION_USER_URN = CorpUserUrn("_ingestion")
+DEFAULT_OWNERSHIP_TYPE = OwnershipType.TECHNICAL_OWNER
+
+
+class Mapper:
+    def __init__(
+        self,
+        workspace_name: str,
+        platform_instance: Optional[str] = None,
+        env: Optional[str] = None,
+        base_url: str = HEX_API_BASE_URL_DEFAULT,
+        patch_metadata: bool = True,
+        collections_as_tags: bool = True,
+        status_as_tag: bool = True,
+        categories_as_tags: bool = True,
+        set_ownership_from_email: bool = True,
+    ):
+        self._workspace_name = workspace_name
+        self._env = env
+        self._platform_instance = platform_instance
+        self._workspace_urn = Mapper._get_workspace_urn(
+            workspace_name=workspace_name,
+            platform=HEX_PLATFORM_NAME,
+            env=env,
+            platform_instance=platform_instance,
+        )
+        self._base_url = base_url.strip("/").replace("/api/v1", "")
+        self._patch_metadata = patch_metadata
+        self._collections_as_tags = collections_as_tags
+        self._status_as_tag = status_as_tag
+        self._categories_as_tags = categories_as_tags
+        self._set_ownership_from_email = set_ownership_from_email
+
+    def map_workspace(self) -> Iterable[MetadataWorkUnit]:
+        container_properties = ContainerPropertiesClass(
+            name=self._workspace_name,
+            env=self._env,
+        )
+        yield from self._yield_mcps(
+            entity_urn=self._workspace_urn,
+            aspects=[container_properties],
+        )
+
+    def map_project(self, project: Project) -> Iterable[MetadataWorkUnit]:
+        dashboard_urn = self._get_dashboard_urn(name=project.id)
+
+        dashboard_info = DashboardInfoClass(
+            title=project.title,
+            description=project.description or "",
+            lastModified=self._change_audit_stamps(
+                created_at=project.created_at, last_edited_at=project.last_edited_at
+            ),
+            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
+            customProperties=dict(id=project.id),
+        )
+
+        subtypes = SubTypesClass(
+            typeNames=[BIAssetSubTypes.HEX_PROJECT],
+        )
+
+        platform_instance = self._platform_instance_aspect()
+
+        container = ContainerClass(
+            container=self._workspace_urn.urn(),
+        )
+
+        tags = self._global_tags(
+            status=project.status,
+            categories=project.categories,
+            collections=project.collections,
+        )
+
+        ownership = self._ownership(creator=project.creator, owner=project.owner)
+
+        usage_stats_all_time, usage_stats_last_7_days = (
+            self._dashboard_usage_statistics(analytics=project.analytics)
+        )
+
+        yield from self._yield_mcps(
+            entity_urn=dashboard_urn,
+            aspects=[
+                dashboard_info,
+                subtypes,
+                platform_instance,
+                container,
+                tags,
+                ownership,
+                usage_stats_all_time,
+                usage_stats_last_7_days,
+            ],
+        )
+
+    def map_component(self, component: Component) -> Iterable[MetadataWorkUnit]:
+        dashboard_urn = self._get_dashboard_urn(name=component.id)
+
+        dashboard_info = DashboardInfoClass(
+            title=component.title,
+            description=component.description or "",
+            lastModified=self._change_audit_stamps(
+                created_at=component.created_at, last_edited_at=component.last_edited_at
+            ),
+            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{component.id}",
+            customProperties=dict(id=component.id),
+        )
+
+        subtypes = SubTypesClass(
+            typeNames=[BIAssetSubTypes.HEX_COMPONENT],
+        )
+
+        platform_instance = self._platform_instance_aspect()
+
+        container = ContainerClass(
+            container=self._workspace_urn.urn(),
+        )
+
+        tags = self._global_tags(
+            status=component.status,
+            categories=component.categories,
+            collections=component.collections,
+        )
+
+        ownership = self._ownership(creator=component.creator, owner=component.owner)
+
+        usage_stats_all_time, usage_stats_last_7_days = (
+            self._dashboard_usage_statistics(analytics=component.analytics)
+        )
+
+        yield from self._yield_mcps(
+            entity_urn=dashboard_urn,
+            aspects=[
+                dashboard_info,
+                subtypes,
+                platform_instance,
+                container,
+                tags,
+                ownership,
+                usage_stats_all_time,
+                usage_stats_last_7_days,
+            ],
+        )
+
+    @classmethod
+    def _get_workspace_urn(
+        cls,
+        workspace_name: str,
+        platform: str = HEX_PLATFORM_NAME,
+        env: Optional[str] = None,
+        platform_instance: Optional[str] = None,
+    ) -> ContainerUrn:
+        workspace_key = WorkspaceKey(
+            platform=platform,
+            env=env,
+            platform_instance=platform_instance,
+            workspace_name=workspace_name,
+        )
+        container_urn_str = make_container_urn(guid=workspace_key)
+        container_urn = Urn.from_string(container_urn_str)
+        assert isinstance(container_urn, ContainerUrn)
+        return container_urn
+
+    def _get_dashboard_urn(self, name: str) -> DashboardUrn:
+        dashboard_urn_str = make_dashboard_urn(
+            platform=HEX_PLATFORM_NAME,
+            name=name,
+            platform_instance=self._platform_instance,
+        )
+        dashboard_urn = Urn.from_string(dashboard_urn_str)
+        assert isinstance(dashboard_urn, DashboardUrn)
+        return dashboard_urn
+
+    def _change_audit_stamps(
+        self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
+    ) -> ChangeAuditStampsClass:
+        return ChangeAuditStampsClass(
+            created=AuditStampClass(
+                time=make_ts_millis(created_at),
+                actor=DEFAULT_INGESTION_USER_URN.urn(),
+            )
+            if created_at
+            else None,
+            lastModified=AuditStampClass(
+                time=make_ts_millis(last_edited_at),
+                actor=DEFAULT_INGESTION_USER_URN.urn(),
+            )
+            if last_edited_at
+            else None,
+        )
+
+    def _global_tags(
+        self,
+        status: Optional[Status],
+        categories: Optional[List[Category]],
+        collections: Optional[List[Collection]],
+    ) -> Optional[GlobalTagsClass]:
+        tag_associations: List[TagAssociationClass] = []
+        if status and self._status_as_tag:
+            tag_associations.append(
+                TagAssociationClass(tag=make_tag_urn(tag=f"hex:status:{status.name}"))
+            )
+        if categories and self._categories_as_tags:
+            tag_associations.extend(
+                [
+                    TagAssociationClass(
+                        tag=make_tag_urn(tag=f"hex:category:{cat.name}")
+                    )
+                    for cat in categories
+                ]
+            )
+        if collections and self._collections_as_tags:
+            tag_associations.extend(
+                [
+                    TagAssociationClass(
+                        tag=make_tag_urn(tag=f"hex:collection:{col.name}")
+                    )
+                    for col in collections
+                ]
+            )
+
+        return GlobalTagsClass(tags=tag_associations) if tag_associations else None
+
+    def _ownership(
+        self, creator: Optional[Owner], owner: Optional[Owner]
+    ) -> Optional[OwnershipClass]:
+        if self._set_ownership_from_email:
+            # since we are not making any diff of creator/owner, we usually have duplicates
+            # TODO: set or ownership types to properly differentiate them, maybe by config?
+            unique_owners = set(o for o in [creator, owner] if o)
+            owners: List[OwnerClass] = [
+                OwnerClass(owner=make_user_urn(o.email), type=DEFAULT_OWNERSHIP_TYPE)
+                for o in unique_owners
+            ]
+
+            return OwnershipClass(owners=owners) if owners else None
+        return None
+
+    def _dashboard_usage_statistics(
+        self, analytics: Optional[Analytics]
+    ) -> Tuple[
+        Optional[DashboardUsageStatisticsClass], Optional[DashboardUsageStatisticsClass]
+    ]:
+        tm_millis = make_ts_millis(datetime.now())
+        last_viewed_at = (
+            make_ts_millis(analytics.last_viewed_at)
+            if analytics and analytics.last_viewed_at
+            else None
+        )
+
+        usage_all_time: Optional[DashboardUsageStatisticsClass] = (
+            DashboardUsageStatisticsClass(
+                timestampMillis=tm_millis,
+                viewsCount=analytics.appviews_all_time,
+                lastViewedAt=last_viewed_at,
+            )
+            if analytics and analytics.appviews_all_time
+            else None
+        )
+
+        usage_last_7_days: Optional[DashboardUsageStatisticsClass] = (
+            DashboardUsageStatisticsClass(
+                timestampMillis=tm_millis,
+                viewsCount=analytics.appviews_last_7_days,
+                eventGranularity=TimeWindowSizeClass(
+                    unit=CalendarIntervalClass.WEEK, multiple=1
+                ),
+                lastViewedAt=last_viewed_at,
+            )
+            if analytics and analytics.appviews_last_7_days
+            else None
+        )
+        return (usage_all_time, usage_last_7_days)
+
+    def _platform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(HEX_PLATFORM_NAME),
+            instance=make_dataplatform_instance_urn(
+                platform=HEX_PLATFORM_NAME, instance=self._platform_instance
+            )
+            if self._platform_instance
+            else None,
+        )
+
+    def _yield_mcps(
+        self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
+    ) -> Iterable[MetadataWorkUnit]:
+        for mcpw in MetadataChangeProposalWrapper.construct_many(
+            entityUrn=entity_urn.urn(),
+            aspects=aspects,
+        ):
+            wu = MetadataWorkUnit.from_metadata(metadata=mcpw)
+            maybe_wu = self._maybe_patch_wu(wu)
+            if maybe_wu:
+                yield maybe_wu
+
+    def _maybe_patch_wu(self, wu: MetadataWorkUnit) -> Optional[MetadataWorkUnit]:
+        # So far we only have support for DashboardInfo aspect
+
+        dashboard_info_aspect: Optional[DashboardInfoClass] = wu.get_aspect_of_type(
+            DashboardInfoClass
+        )
+
+        if dashboard_info_aspect and self._patch_metadata:
+            return convert_dashboard_info_to_patch(
+                wu.get_urn(),
+                dashboard_info_aspect,
+                wu.metadata.systemMetadata,
+            )
+        else:
+            return wu
datahub/ingestion/source/hex/model.py
@@ -0,0 +1,68 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Optional
+
+
+@dataclass
+class Workspace:
+    name: str
+
+
+@dataclass
+class Status:
+    name: str
+
+
+@dataclass
+class Category:
+    name: str
+    description: Optional[str] = None
+
+
+@dataclass
+class Collection:
+    name: str
+
+
+@dataclass(frozen=True)
+class Owner:
+    email: str
+
+
+@dataclass
+class Analytics:
+    appviews_all_time: Optional[int]
+    appviews_last_7_days: Optional[int]
+    appviews_last_14_days: Optional[int]
+    appviews_last_30_days: Optional[int]
+    last_viewed_at: Optional[datetime]
+
+
+@dataclass
+class Project:
+    id: str
+    title: str
+    description: Optional[str]
+    last_edited_at: Optional[datetime] = None
+    created_at: Optional[datetime] = None
+    status: Optional[Status] = None
+    categories: Optional[List[Category]] = None  # TODO: emit category description!
+    collections: Optional[List[Collection]] = None
+    creator: Optional[Owner] = None
+    owner: Optional[Owner] = None
+    analytics: Optional[Analytics] = None
+
+
+@dataclass
+class Component:
+    id: str
+    title: str
+    description: Optional[str]
+    last_edited_at: Optional[datetime] = None
+    created_at: Optional[datetime] = None
+    status: Optional[Status] = None
+    categories: Optional[List[Category]] = None
+    collections: Optional[List[Collection]] = None
+    creator: Optional[Owner] = None
+    owner: Optional[Owner] = None
+    analytics: Optional[Analytics] = None
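Taken together, the two new hex modules form a small pipeline: the API layer fills in the plain dataclasses from model.py, and Mapper turns them into MCP-backed work units. A minimal usage sketch, assuming the modules land exactly as shown in this diff (the workspace name, project id, and email below are illustrative):

```python
from datetime import datetime, timezone

from datahub.ingestion.source.hex.mapper import Mapper
from datahub.ingestion.source.hex.model import Owner, Project, Status

# Illustrative values; a real run would get these from the Hex API client.
mapper = Mapper(workspace_name="acme", env="PROD")
project = Project(
    id="2a3b4c5d",  # hypothetical Hex project id
    title="Revenue dashboard",
    description=None,
    created_at=datetime(2025, 1, 1, tzinfo=timezone.utc),
    status=Status(name="Published"),
    creator=Owner(email="alice@example.com"),
)

# map_workspace() emits the workspace container; map_project() emits one work
# unit per aspect (DashboardInfo, subtypes, container, tags, ownership, usage).
for wu in [*mapper.map_workspace(), *mapper.map_project(project)]:
    print(wu.id)
```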
datahub/ingestion/source/iceberg/iceberg.py
@@ -69,8 +69,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
@@ -298,67 +296,35 @@ class IcebergSource(StatefulIngestionSourceBase):
             self.config.platform_instance,
             self.config.env,
         )
-        dataset_snapshot = DatasetSnapshot(
-            urn=dataset_urn,
-            aspects=[Status(removed=False)],
-        )
-
-        # Dataset properties aspect.
-        additional_properties = {}
-        custom_properties = table.metadata.properties.copy()
-        custom_properties["location"] = table.metadata.location
-        custom_properties["format-version"] = str(table.metadata.format_version)
-        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(
-                table.current_snapshot().snapshot_id
-            )
-            custom_properties["manifest-list"] = (
-                table.current_snapshot().manifest_list
-            )
-            additional_properties["lastModified"] = TimeStampClass(
-                int(table.current_snapshot().timestamp_ms)
-            )
-        if "created-at" in custom_properties:
-            try:
-                dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                additional_properties["created"] = TimeStampClass(
-                    int(dt.timestamp() * 1000)
-                )
-            except Exception as ex:
-                LOGGER.warning(
-                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn, aspect=Status(removed=False)
+        ).as_workunit()

-        dataset_properties = DatasetPropertiesClass(
-            name=table.name()[-1],
-            description=table.metadata.properties.get("comment", None),
-            customProperties=custom_properties,
-            lastModified=additional_properties.get("lastModified"),
-            created=additional_properties.get("created"),
-            qualifiedName=dataset_name,
+        dataset_properties = self._get_dataset_properties_aspect(
+            dataset_name, table
         )
-        dataset_snapshot.aspects.append(dataset_properties)
-        # Dataset ownership aspect.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn, aspect=dataset_properties
+        ).as_workunit()
+
         dataset_ownership = self._get_ownership_aspect(table)
         if dataset_ownership:
             LOGGER.debug(
                 f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
             )
-            dataset_snapshot.aspects.append(dataset_ownership)
+            yield MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=dataset_ownership
+            ).as_workunit()

         schema_metadata = self._create_schema_metadata(dataset_name, table)
-        dataset_snapshot.aspects.append(schema_metadata)
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn, aspect=schema_metadata
+        ).as_workunit()
+        yield self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)

-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         self.report.report_table_processing_time(
             timer.elapsed_seconds(), dataset_name, table.metadata_location
         )
-        yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-        dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-        if dpi_aspect:
-            yield dpi_aspect

         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
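This hunk is the core of the Iceberg refactor: instead of accumulating aspects on a DatasetSnapshot and emitting one MetadataChangeEvent at the end, each aspect is now wrapped in its own MetadataChangeProposalWrapper and yielded immediately as a work unit. The emission pattern in isolation (the dataset urn is illustrative):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.common import Status

# Illustrative urn; the source builds it with make_dataset_urn_with_platform_instance.
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:iceberg,db.table,PROD)"

# One self-contained proposal per aspect replaces the old snapshot + MCE pair.
wu = MetadataChangeProposalWrapper(
    entityUrn=dataset_urn,
    aspect=Status(removed=False),
).as_workunit()  # ready to be yielded by the source
```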
@@ -407,6 +373,40 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
         return None

+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        additional_properties = {}
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        if table.current_snapshot():
+            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
+            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
+        )
+
     def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
         owners = []
         if self.config.user_ownership_property:
@@ -435,22 +435,18 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
         return OwnershipClass(owners=owners) if owners else None

-    def _get_dataplatform_instance_aspect(
-        self, dataset_urn: str
-    ) -> Optional[MetadataWorkUnit]:
-        # If we are a platform instance based source, emit the instance aspect
-        if self.config.platform_instance:
-            return MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self, dataset_urn: str) -> MetadataWorkUnit:
+        return MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.platform),
+                instance=make_dataplatform_instance_urn(
+                    self.platform, self.config.platform_instance
+                )
+                if self.config.platform_instance
+                else None,
+            ),
+        ).as_workunit()

     def _create_schema_metadata(
         self, dataset_name: str, table: Table
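A behavioral detail in this last hunk: _get_dataplatform_instance_aspect no longer returns None when platform_instance is unset; the aspect is always emitted, and only its instance field is conditional. A standalone sketch of that pattern with the urn helpers stubbed out (the names here are illustrative stand-ins, not datahub APIs):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PlatformInstanceAspect:  # stand-in for DataPlatformInstanceClass
    platform: str
    instance: Optional[str] = None


def instance_aspect(platform: str, platform_instance: Optional[str]) -> PlatformInstanceAspect:
    # The aspect itself is unconditional; only the optional instance urn
    # depends on configuration, mirroring the refactored method above.
    return PlatformInstanceAspect(
        platform=f"urn:li:dataPlatform:{platform}",
        instance=(
            f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:{platform},{platform_instance})"
            if platform_instance
            else None
        ),
    )


assert instance_aspect("iceberg", None).instance is None
assert instance_aspect("iceberg", "prod").instance is not None
```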