acryl-datahub 1.0.0.1rc1__py3-none-any.whl → 1.0.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/METADATA +2575 -2574
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/RECORD +77 -60
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +6 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +6 -11
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +16 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +62 -66
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/powerbi.py +29 -23
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +15 -6
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +309 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +22 -0
- datahub/sdk/search_filters.py +4 -4
- datahub/sql_parsing/split_statements.py +5 -1
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/ingestion/source/vertexai.py +0 -695
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Iterable, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
from datahub._codegen.aspect import (
|
|
6
|
+
_Aspect, # TODO: is there a better import than this one?
|
|
7
|
+
)
|
|
8
|
+
from datahub.emitter.mce_builder import (
|
|
9
|
+
make_container_urn,
|
|
10
|
+
make_dashboard_urn,
|
|
11
|
+
make_data_platform_urn,
|
|
12
|
+
make_dataplatform_instance_urn,
|
|
13
|
+
make_tag_urn,
|
|
14
|
+
make_ts_millis,
|
|
15
|
+
make_user_urn,
|
|
16
|
+
)
|
|
17
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
18
|
+
from datahub.emitter.mcp_builder import ContainerKey
|
|
19
|
+
from datahub.ingestion.api.incremental_lineage_helper import (
|
|
20
|
+
convert_dashboard_info_to_patch,
|
|
21
|
+
)
|
|
22
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
23
|
+
from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
|
|
24
|
+
from datahub.ingestion.source.hex.constants import (
|
|
25
|
+
HEX_API_BASE_URL_DEFAULT,
|
|
26
|
+
HEX_PLATFORM_NAME,
|
|
27
|
+
)
|
|
28
|
+
from datahub.ingestion.source.hex.model import (
|
|
29
|
+
Analytics,
|
|
30
|
+
Category,
|
|
31
|
+
Collection,
|
|
32
|
+
Component,
|
|
33
|
+
Owner,
|
|
34
|
+
Project,
|
|
35
|
+
Status,
|
|
36
|
+
)
|
|
37
|
+
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
38
|
+
AuditStampClass,
|
|
39
|
+
ChangeAuditStampsClass,
|
|
40
|
+
OwnershipType,
|
|
41
|
+
)
|
|
42
|
+
from datahub.metadata.schema_classes import (
|
|
43
|
+
CalendarIntervalClass,
|
|
44
|
+
ContainerClass,
|
|
45
|
+
ContainerPropertiesClass,
|
|
46
|
+
DashboardInfoClass,
|
|
47
|
+
DashboardUsageStatisticsClass,
|
|
48
|
+
DataPlatformInstanceClass,
|
|
49
|
+
GlobalTagsClass,
|
|
50
|
+
OwnerClass,
|
|
51
|
+
OwnershipClass,
|
|
52
|
+
SubTypesClass,
|
|
53
|
+
TagAssociationClass,
|
|
54
|
+
TimeWindowSizeClass,
|
|
55
|
+
)
|
|
56
|
+
from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
|
|
57
|
+
|
|
58
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class WorkspaceKey(ContainerKey):
    """Container key that adds the workspace name to the base key fields."""

    # Name of the workspace this container key identifies.
    workspace_name: str
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Actor recorded on the audit stamps built by Mapper._change_audit_stamps.
DEFAULT_INGESTION_USER_URN = CorpUserUrn("_ingestion")

# Ownership type given to every owner built by Mapper._ownership.
DEFAULT_OWNERSHIP_TYPE = OwnershipType.TECHNICAL_OWNER
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Mapper:
|
|
70
|
+
def __init__(
    self,
    workspace_name: str,
    platform_instance: Optional[str] = None,
    env: Optional[str] = None,
    base_url: str = HEX_API_BASE_URL_DEFAULT,
    patch_metadata: bool = True,
    collections_as_tags: bool = True,
    status_as_tag: bool = True,
    categories_as_tags: bool = True,
    set_ownership_from_email: bool = True,
):
    """Store the mapping options and pre-compute the workspace container urn.

    Args:
        workspace_name: Workspace name; used for the container and in
            external URLs.
        platform_instance: Optional platform instance used when building urns.
        env: Optional environment passed to the container key/properties.
        base_url: API base URL; surrounding slashes and the "/api/v1"
            fragment are removed before it is stored.
        patch_metadata: When true, DashboardInfo work units are converted
            to patches.
        collections_as_tags: Emit a tag per collection.
        status_as_tag: Emit a tag for the status.
        categories_as_tags: Emit a tag per category.
        set_ownership_from_email: Build ownership from creator/owner emails.
    """
    # Identity of the workspace and the urn derived from it.
    self._workspace_name = workspace_name
    self._env = env
    self._platform_instance = platform_instance
    self._workspace_urn = Mapper._get_workspace_urn(
        workspace_name=workspace_name,
        platform=HEX_PLATFORM_NAME,
        env=env,
        platform_instance=platform_instance,
    )

    # Turn the API base URL into the URL used for external links.
    trimmed_url = base_url.strip("/")
    self._base_url = trimmed_url.replace("/api/v1", "")

    # Feature switches.
    self._patch_metadata = patch_metadata
    self._collections_as_tags = collections_as_tags
    self._status_as_tag = status_as_tag
    self._categories_as_tags = categories_as_tags
    self._set_ownership_from_email = set_ownership_from_email
|
|
97
|
+
|
|
98
|
+
def map_workspace(self) -> Iterable[MetadataWorkUnit]:
    """Yield the work units that describe the workspace container."""
    workspace_aspects: List[Optional[_Aspect]] = [
        ContainerPropertiesClass(name=self._workspace_name, env=self._env),
    ]
    yield from self._yield_mcps(
        entity_urn=self._workspace_urn,
        aspects=workspace_aspects,
    )
|
|
107
|
+
|
|
108
|
+
def map_project(self, project: Project) -> Iterable[MetadataWorkUnit]:
    """Yield the work units for one project, modelled as a dashboard.

    Aspects that resolve to ``None`` (tags, ownership, usage statistics) are
    passed through unchanged to ``_yield_mcps``.
    """
    urn = self._get_dashboard_urn(name=project.id)

    info = DashboardInfoClass(
        title=project.title,
        description=project.description or "",
        lastModified=self._change_audit_stamps(
            created_at=project.created_at,
            last_edited_at=project.last_edited_at,
        ),
        externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
        customProperties={"id": project.id},
    )

    # The trailing star-unpack contributes the (all-time, last-7-days) pair.
    aspects: List[Optional[_Aspect]] = [
        info,
        SubTypesClass(typeNames=[BIAssetSubTypes.HEX_PROJECT]),
        self._platform_instance_aspect(),
        ContainerClass(container=self._workspace_urn.urn()),
        self._global_tags(
            status=project.status,
            categories=project.categories,
            collections=project.collections,
        ),
        self._ownership(creator=project.creator, owner=project.owner),
        *self._dashboard_usage_statistics(analytics=project.analytics),
    ]

    yield from self._yield_mcps(entity_urn=urn, aspects=aspects)
|
|
156
|
+
|
|
157
|
+
def map_component(self, component: Component) -> Iterable[MetadataWorkUnit]:
    """Yield the work units for one component, modelled as a dashboard.

    Aspects that resolve to ``None`` (tags, ownership, usage statistics) are
    passed through unchanged to ``_yield_mcps``.
    """
    urn = self._get_dashboard_urn(name=component.id)

    info = DashboardInfoClass(
        title=component.title,
        description=component.description or "",
        lastModified=self._change_audit_stamps(
            created_at=component.created_at,
            last_edited_at=component.last_edited_at,
        ),
        externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{component.id}",
        customProperties={"id": component.id},
    )

    # The trailing star-unpack contributes the (all-time, last-7-days) pair.
    aspects: List[Optional[_Aspect]] = [
        info,
        SubTypesClass(typeNames=[BIAssetSubTypes.HEX_COMPONENT]),
        self._platform_instance_aspect(),
        ContainerClass(container=self._workspace_urn.urn()),
        self._global_tags(
            status=component.status,
            categories=component.categories,
            collections=component.collections,
        ),
        self._ownership(creator=component.creator, owner=component.owner),
        *self._dashboard_usage_statistics(analytics=component.analytics),
    ]

    yield from self._yield_mcps(entity_urn=urn, aspects=aspects)
|
|
205
|
+
|
|
206
|
+
@classmethod
def _get_workspace_urn(
    cls,
    workspace_name: str,
    platform: str = HEX_PLATFORM_NAME,
    env: Optional[str] = None,
    platform_instance: Optional[str] = None,
) -> ContainerUrn:
    """Build the container urn for a workspace from its ``WorkspaceKey``."""
    key = WorkspaceKey(
        workspace_name=workspace_name,
        platform=platform,
        platform_instance=platform_instance,
        env=env,
    )
    parsed = Urn.from_string(make_container_urn(guid=key))
    # Narrow the generic Urn to the concrete type promised by the signature.
    assert isinstance(parsed, ContainerUrn)
    return parsed
|
|
224
|
+
|
|
225
|
+
def _get_dashboard_urn(self, name: str) -> DashboardUrn:
    """Build the dashboard urn for ``name`` on the Hex platform."""
    parsed = Urn.from_string(
        make_dashboard_urn(
            platform=HEX_PLATFORM_NAME,
            name=name,
            platform_instance=self._platform_instance,
        )
    )
    # Narrow the generic Urn to the concrete type promised by the signature.
    assert isinstance(parsed, DashboardUrn)
    return parsed
|
|
234
|
+
|
|
235
|
+
def _change_audit_stamps(
    self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
) -> ChangeAuditStampsClass:
    """Build the created/lastModified audit stamps for an entity.

    A stamp is only built for a timestamp that is set; otherwise ``None`` is
    passed for that field. The actor is always the default ingestion user.
    """

    def _stamp(moment: Optional[datetime]) -> Optional[AuditStampClass]:
        # No timestamp means no stamp for that field.
        if not moment:
            return None
        return AuditStampClass(
            time=make_ts_millis(moment),
            actor=DEFAULT_INGESTION_USER_URN.urn(),
        )

    return ChangeAuditStampsClass(
        created=_stamp(created_at),
        lastModified=_stamp(last_edited_at),
    )
|
|
252
|
+
|
|
253
|
+
def _global_tags(
    self,
    status: Optional[Status],
    categories: Optional[List[Category]],
    collections: Optional[List[Collection]],
) -> Optional[GlobalTagsClass]:
    """Turn status, categories and collections into a tags aspect.

    Each source is only used when its ``*_as_tag(s)`` switch is on. Tags are
    emitted in the order status, categories, collections. Returns ``None``
    when no tag was produced.
    """
    tag_names: List[str] = []

    if self._status_as_tag and status:
        tag_names.append(f"hex:status:{status.name}")

    if self._categories_as_tags and categories:
        tag_names.extend(f"hex:category:{cat.name}" for cat in categories)

    if self._collections_as_tags and collections:
        tag_names.extend(f"hex:collection:{col.name}" for col in collections)

    if not tag_names:
        return None

    return GlobalTagsClass(
        tags=[
            TagAssociationClass(tag=make_tag_urn(tag=tag_name))
            for tag_name in tag_names
        ]
    )
|
|
284
|
+
|
|
285
|
+
def _ownership(
    self, creator: Optional[Owner], owner: Optional[Owner]
) -> Optional[OwnershipClass]:
    """Build the ownership aspect from the creator and owner.

    Args:
        creator: Creator of the entity, if known.
        owner: Owner of the entity, if known.

    Returns:
        An ownership aspect listing each distinct owner once, creator first,
        or ``None`` when ownership mapping is disabled or no owner is given.
    """
    if not self._set_ownership_from_email:
        return None

    # Creator and owner are not differentiated, so they are often the same
    # person and must be de-duplicated.
    # TODO: set or ownership types to properly differentiate them, maybe by config?
    # dict.fromkeys keeps first-seen order, so the emitted owner list is the
    # same on every run (a set would order by randomized string hashes).
    unique_owners = list(dict.fromkeys(o for o in (creator, owner) if o))
    if not unique_owners:
        return None

    return OwnershipClass(
        owners=[
            OwnerClass(owner=make_user_urn(o.email), type=DEFAULT_OWNERSHIP_TYPE)
            for o in unique_owners
        ]
    )
|
|
299
|
+
|
|
300
|
+
def _dashboard_usage_statistics(
    self, analytics: Optional[Analytics]
) -> Tuple[
    Optional[DashboardUsageStatisticsClass], Optional[DashboardUsageStatisticsClass]
]:
    """Build the all-time and last-7-days usage statistics aspects.

    Returns:
        A pair ``(all_time, last_7_days)``. An element is ``None`` when there
        are no analytics or the matching view count is unset or zero.
    """
    # Both aspects share the same "now" timestamp.
    tm_millis = make_ts_millis(datetime.now())

    if not analytics:
        return (None, None)

    last_viewed_at = None
    if analytics.last_viewed_at:
        last_viewed_at = make_ts_millis(analytics.last_viewed_at)

    all_time_stats: Optional[DashboardUsageStatisticsClass] = None
    if analytics.appviews_all_time:
        all_time_stats = DashboardUsageStatisticsClass(
            timestampMillis=tm_millis,
            viewsCount=analytics.appviews_all_time,
            lastViewedAt=last_viewed_at,
        )

    weekly_stats: Optional[DashboardUsageStatisticsClass] = None
    if analytics.appviews_last_7_days:
        weekly_stats = DashboardUsageStatisticsClass(
            timestampMillis=tm_millis,
            viewsCount=analytics.appviews_last_7_days,
            eventGranularity=TimeWindowSizeClass(
                unit=CalendarIntervalClass.WEEK, multiple=1
            ),
            lastViewedAt=last_viewed_at,
        )

    return (all_time_stats, weekly_stats)
|
|
335
|
+
|
|
336
|
+
def _platform_instance_aspect(self) -> DataPlatformInstanceClass:
    """Build the data-platform-instance aspect for the Hex platform.

    The ``instance`` field is only set when a platform instance is configured.
    """
    instance_urn = None
    if self._platform_instance:
        instance_urn = make_dataplatform_instance_urn(
            platform=HEX_PLATFORM_NAME, instance=self._platform_instance
        )

    return DataPlatformInstanceClass(
        platform=make_data_platform_urn(HEX_PLATFORM_NAME),
        instance=instance_urn,
    )
|
|
345
|
+
|
|
346
|
+
def _yield_mcps(
    self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
) -> Iterable[MetadataWorkUnit]:
    """Wrap the aspects of one entity into work units and yield them.

    Each work unit goes through ``_maybe_patch_wu`` first; falsy results are
    dropped.
    """
    proposals = MetadataChangeProposalWrapper.construct_many(
        entityUrn=entity_urn.urn(),
        aspects=aspects,
    )
    for proposal in proposals:
        work_unit = self._maybe_patch_wu(
            MetadataWorkUnit.from_metadata(metadata=proposal)
        )
        if not work_unit:
            continue
        yield work_unit
|
|
357
|
+
|
|
358
|
+
def _maybe_patch_wu(self, wu: MetadataWorkUnit) -> Optional[MetadataWorkUnit]:
    """Convert a DashboardInfo work unit to a patch when patching is enabled.

    Only the DashboardInfo aspect is supported so far; every other work unit
    is returned unchanged.
    """
    info: Optional[DashboardInfoClass] = wu.get_aspect_of_type(DashboardInfoClass)

    if not (info and self._patch_metadata):
        return wu

    return convert_dashboard_info_to_patch(
        wu.get_urn(),
        info,
        wu.metadata.systemMetadata,
    )
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Workspace:
    """A workspace, identified only by its name."""

    # Workspace name.
    name: str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Status:
    """A status, identified only by its name."""

    # Status name.
    name: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Category:
    """A category with a name and an optional description."""

    # Category name.
    name: str
    # Free-text description; None when not provided.
    description: Optional[str] = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Collection:
    """A collection, identified only by its name."""

    # Collection name.
    name: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class Owner:
    """An owner identified by e-mail address.

    Frozen, so instances are immutable and hashable.
    """

    # E-mail address of the owner.
    email: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Analytics:
    """App-view counters and last-viewed time; every field may be None."""

    # View count over all time.
    appviews_all_time: Optional[int]
    # View count over the last 7 days.
    appviews_last_7_days: Optional[int]
    # View count over the last 14 days.
    appviews_last_14_days: Optional[int]
    # View count over the last 30 days.
    appviews_last_30_days: Optional[int]
    # When the entity was last viewed.
    last_viewed_at: Optional[datetime]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class Project:
    """A project: id, title and description plus optional metadata."""

    # Required fields.
    id: str
    title: str
    description: Optional[str]

    # Optional timestamps.
    last_edited_at: Optional[datetime] = None
    created_at: Optional[datetime] = None

    # Optional classification.
    status: Optional[Status] = None
    categories: Optional[List[Category]] = None  # TODO: emit category description!
    collections: Optional[List[Collection]] = None

    # Optional people.
    creator: Optional[Owner] = None
    owner: Optional[Owner] = None

    # Optional usage counters.
    analytics: Optional[Analytics] = None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class Component:
    """A component: id, title and description plus optional metadata."""

    # Required fields.
    id: str
    title: str
    description: Optional[str]

    # Optional timestamps.
    last_edited_at: Optional[datetime] = None
    created_at: Optional[datetime] = None

    # Optional classification.
    status: Optional[Status] = None
    categories: Optional[List[Category]] = None
    collections: Optional[List[Collection]] = None

    # Optional people.
    creator: Optional[Owner] = None
    owner: Optional[Owner] = None

    # Optional usage counters.
    analytics: Optional[Analytics] = None
|
|
@@ -69,8 +69,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
69
69
|
StatefulIngestionSourceBase,
|
|
70
70
|
)
|
|
71
71
|
from datahub.metadata.com.linkedin.pegasus2avro.common import Status
|
|
72
|
-
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
|
|
73
|
-
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
74
72
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
75
73
|
OtherSchema,
|
|
76
74
|
SchemaField,
|
|
@@ -298,67 +296,35 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
298
296
|
self.config.platform_instance,
|
|
299
297
|
self.config.env,
|
|
300
298
|
)
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
# Dataset properties aspect.
|
|
307
|
-
additional_properties = {}
|
|
308
|
-
custom_properties = table.metadata.properties.copy()
|
|
309
|
-
custom_properties["location"] = table.metadata.location
|
|
310
|
-
custom_properties["format-version"] = str(table.metadata.format_version)
|
|
311
|
-
custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
|
|
312
|
-
if table.current_snapshot():
|
|
313
|
-
custom_properties["snapshot-id"] = str(
|
|
314
|
-
table.current_snapshot().snapshot_id
|
|
315
|
-
)
|
|
316
|
-
custom_properties["manifest-list"] = (
|
|
317
|
-
table.current_snapshot().manifest_list
|
|
318
|
-
)
|
|
319
|
-
additional_properties["lastModified"] = TimeStampClass(
|
|
320
|
-
int(table.current_snapshot().timestamp_ms)
|
|
321
|
-
)
|
|
322
|
-
if "created-at" in custom_properties:
|
|
323
|
-
try:
|
|
324
|
-
dt = dateutil_parser.isoparse(custom_properties["created-at"])
|
|
325
|
-
additional_properties["created"] = TimeStampClass(
|
|
326
|
-
int(dt.timestamp() * 1000)
|
|
327
|
-
)
|
|
328
|
-
except Exception as ex:
|
|
329
|
-
LOGGER.warning(
|
|
330
|
-
f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
|
|
331
|
-
)
|
|
299
|
+
yield MetadataChangeProposalWrapper(
|
|
300
|
+
entityUrn=dataset_urn, aspect=Status(removed=False)
|
|
301
|
+
).as_workunit()
|
|
332
302
|
|
|
333
|
-
dataset_properties =
|
|
334
|
-
|
|
335
|
-
description=table.metadata.properties.get("comment", None),
|
|
336
|
-
customProperties=custom_properties,
|
|
337
|
-
lastModified=additional_properties.get("lastModified"),
|
|
338
|
-
created=additional_properties.get("created"),
|
|
339
|
-
qualifiedName=dataset_name,
|
|
303
|
+
dataset_properties = self._get_dataset_properties_aspect(
|
|
304
|
+
dataset_name, table
|
|
340
305
|
)
|
|
341
|
-
|
|
342
|
-
|
|
306
|
+
yield MetadataChangeProposalWrapper(
|
|
307
|
+
entityUrn=dataset_urn, aspect=dataset_properties
|
|
308
|
+
).as_workunit()
|
|
309
|
+
|
|
343
310
|
dataset_ownership = self._get_ownership_aspect(table)
|
|
344
311
|
if dataset_ownership:
|
|
345
312
|
LOGGER.debug(
|
|
346
313
|
f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
|
|
347
314
|
)
|
|
348
|
-
|
|
315
|
+
yield MetadataChangeProposalWrapper(
|
|
316
|
+
entityUrn=dataset_urn, aspect=dataset_ownership
|
|
317
|
+
).as_workunit()
|
|
349
318
|
|
|
350
319
|
schema_metadata = self._create_schema_metadata(dataset_name, table)
|
|
351
|
-
|
|
320
|
+
yield MetadataChangeProposalWrapper(
|
|
321
|
+
entityUrn=dataset_urn, aspect=schema_metadata
|
|
322
|
+
).as_workunit()
|
|
323
|
+
yield self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
352
324
|
|
|
353
|
-
mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
|
|
354
325
|
self.report.report_table_processing_time(
|
|
355
326
|
timer.elapsed_seconds(), dataset_name, table.metadata_location
|
|
356
327
|
)
|
|
357
|
-
yield MetadataWorkUnit(id=dataset_name, mce=mce)
|
|
358
|
-
|
|
359
|
-
dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
360
|
-
if dpi_aspect:
|
|
361
|
-
yield dpi_aspect
|
|
362
328
|
|
|
363
329
|
if self.config.is_profiling_enabled():
|
|
364
330
|
profiler = IcebergProfiler(self.report, self.config.profiling)
|
|
@@ -407,6 +373,40 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
407
373
|
)
|
|
408
374
|
return None
|
|
409
375
|
|
|
376
|
+
def _get_dataset_properties_aspect(
    self, dataset_name: str, table: Table
) -> DatasetPropertiesClass:
    """Build the dataset-properties aspect for a table.

    The table's own properties are copied and extended with location,
    format version and partition spec. When the table has a current
    snapshot, its id, manifest list and timestamp are added as well.

    Args:
        dataset_name: Used as the aspect's qualified name.
        table: Table whose metadata is read.

    Returns:
        The populated dataset-properties aspect. ``created`` is only set when
        the "created-at" property exists and parses as an ISO date.
    """
    custom_properties = table.metadata.properties.copy()
    custom_properties["location"] = table.metadata.location
    custom_properties["format-version"] = str(table.metadata.format_version)
    custom_properties["partition-spec"] = str(self._get_partition_aspect(table))

    # Look the snapshot up once instead of on every attribute access.
    last_modified: Optional[TimeStampClass] = None
    snapshot = table.current_snapshot()
    if snapshot:
        custom_properties["snapshot-id"] = str(snapshot.snapshot_id)
        custom_properties["manifest-list"] = snapshot.manifest_list
        last_modified = TimeStampClass(int(snapshot.timestamp_ms))

    created: Optional[TimeStampClass] = None
    if "created-at" in custom_properties:
        try:
            dt = dateutil_parser.isoparse(custom_properties["created-at"])
            created = TimeStampClass(int(dt.timestamp() * 1000))
        except Exception as ex:
            # Best effort: an unparseable creation date must not fail the table.
            LOGGER.warning(
                f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
            )

    return DatasetPropertiesClass(
        name=table.name()[-1],
        description=table.metadata.properties.get("comment", None),
        customProperties=custom_properties,
        lastModified=last_modified,
        created=created,
        qualifiedName=dataset_name,
    )
|
|
409
|
+
|
|
410
410
|
def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
|
|
411
411
|
owners = []
|
|
412
412
|
if self.config.user_ownership_property:
|
|
@@ -435,22 +435,18 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
435
435
|
)
|
|
436
436
|
return OwnershipClass(owners=owners) if owners else None
|
|
437
437
|
|
|
438
|
-
def _get_dataplatform_instance_aspect(
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
),
|
|
451
|
-
).as_workunit()
|
|
452
|
-
|
|
453
|
-
return None
|
|
438
|
+
def _get_dataplatform_instance_aspect(self, dataset_urn: str) -> MetadataWorkUnit:
    """Build the data-platform-instance work unit for a dataset.

    The ``instance`` field is only set when a platform instance is configured.
    """
    configured_instance = self.config.platform_instance
    instance_urn = (
        make_dataplatform_instance_urn(self.platform, configured_instance)
        if configured_instance
        else None
    )
    platform_aspect = DataPlatformInstanceClass(
        platform=make_data_platform_urn(self.platform),
        instance=instance_urn,
    )
    return MetadataChangeProposalWrapper(
        entityUrn=dataset_urn,
        aspect=platform_aspect,
    ).as_workunit()
|
|
454
450
|
|
|
455
451
|
def _create_schema_metadata(
|
|
456
452
|
self, dataset_name: str, table: Table
|