acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc1__py3-none-any.whl
This diff compares the contents of two package versions as they were publicly released to one of the supported registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/METADATA +2410 -2410
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/RECORD +38 -36
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py

@@ -1,5 +1,10 @@
 import logging
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )
 
 from pydantic import BaseModel
 
@@ -7,7 +12,6 @@ from datahub.api.entities.external.external_entities import (
     ExternalEntity,
     ExternalEntityId,
     LinkedResourceSet,
-    PlatformResourceRepository,
 )
 from datahub.api.entities.external.lake_formation_external_entites import (
     LakeFormationTag,
@@ -15,10 +19,8 @@ from datahub.api.entities.external.lake_formation_external_entites import (
 from datahub.api.entities.platformresource.platform_resource import (
     PlatformResource,
     PlatformResourceKey,
-    PlatformResourceSearchFields,
 )
 from datahub.metadata.urns import TagUrn
-from datahub.utilities.search_utils import ElasticDocumentQuery
 from datahub.utilities.urns.urn import Urn
 
 logger = logging.getLogger(__name__)
@@ -29,8 +31,12 @@ class LakeFormationTagSyncContext(BaseModel):
     platform_instance: Optional[str] = None
     catalog: Optional[str] = None
 
+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+
 
-class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
     """
     A LakeFormationTag is a unique identifier for a Lakeformation tag.
     """
@@ -42,9 +48,6 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     exists_in_lake_formation: bool = False
     persisted: bool = False
 
-    def __hash__(self) -> int:
-        return hash(self.to_platform_resource_key().id)
-
     # this is a hack to make sure the property is a string and not private pydantic field
     @staticmethod
     def _RESOURCE_TYPE() -> str:
@@ -61,24 +64,26 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     )
 
     @classmethod
-    def
+    def get_or_create_from_tag(
         cls,
         tag: LakeFormationTag,
-
-        platform_resource_repository: PlatformResourceRepository,
-        catalog: Optional[str] = None,
+        platform_resource_repository: "GluePlatformResourceRepository",
         exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
     ) -> "LakeFormationTagPlatformResourceId":
         """
         Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
         """
 
+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
         existing_platform_resource = cls.search_by_urn(
             tag.to_datahub_tag_urn().urn(),
             platform_resource_repository=platform_resource_repository,
             tag_sync_context=LakeFormationTagSyncContext(
-                platform_instance=platform_instance,
-                catalog=catalog,
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
             ),
         )
         if existing_platform_resource:
@@ -90,9 +95,9 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         return LakeFormationTagPlatformResourceId(
             tag_key=str(tag.key),
             tag_value=str(tag.value) if tag.value is not None else None,
-            platform_instance=platform_instance,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
             exists_in_lake_formation=exists_in_lake_formation,
-            catalog=catalog,
             persisted=False,
         )
 
@@ -100,64 +105,48 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     def search_by_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> Optional["LakeFormationTagPlatformResourceId"]:
-
-
-
-
-
-
-
-
-
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
                 )
-
-
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
         logger.info(
-            f"
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
         )
-        if len(mapped_tags) > 0:
-            for platform_resource in mapped_tags:
-                if (
-                    platform_resource.resource_info
-                    and platform_resource.resource_info.value
-                ):
-                    lake_formation_tag_platform_resource = (
-                        LakeFormationTagPlatformResource(
-                            **platform_resource.resource_info.value.as_pydantic_object(
-                                LakeFormationTagPlatformResource
-                            ).dict()
-                        )
-                    )
-                    if (
-                        lake_formation_tag_platform_resource.id.platform_instance
-                        == tag_sync_context.platform_instance
-                        and lake_formation_tag_platform_resource.id.catalog
-                        == tag_sync_context.catalog
-                    ):
-                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
-                        lake_formation_tag_id.exists_in_lake_formation = True
-                        lake_formation_tag_id.persisted = True
-                        return lake_formation_tag_id
-                else:
-                    logger.warning(
-                        f"Platform resource {platform_resource} does not have a resource_info value"
-                    )
-                    continue
-
-        # If we reach here, it means we did not find a mapped tag for the URN
-        logger.info(
-            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
-        )
         return None
 
     @classmethod
     def from_datahub_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> "LakeFormationTagPlatformResourceId":
         """
@@ -188,11 +177,17 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
             logger.info(
                 f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
             )
-
-
+            # Create a new ID with the correct state instead of mutating
+            return LakeFormationTagPlatformResourceId(
+                tag_key=new_tag_id.tag_key,
+                tag_value=new_tag_id.tag_value,
+                platform_instance=new_tag_id.platform_instance,
+                catalog=new_tag_id.catalog,
+                exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                persisted=new_tag_id.persisted,
            )
        return new_tag_id
-        raise ValueError(f"Unable to create
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
 
     @classmethod
     def generate_tag_id(
@@ -223,7 +218,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     )
 
 
-class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+class LakeFormationTagPlatformResource(ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
@@ -246,46 +241,29 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
     )
 
     @classmethod
-    def
+    def create_default(
         cls,
-
-
-        managed_by_datahub: bool = False,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
     ) -> "LakeFormationTagPlatformResource":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        for platform_resource in platform_resources:
-            if (
-                platform_resource.resource_info
-                and platform_resource.resource_info.value
-            ):
-                lf_tag = LakeFormationTagPlatformResource(
-                    **platform_resource.resource_info.value.as_pydantic_object(
-                        LakeFormationTagPlatformResource
-                    ).dict()
-                )
-                if (
-                    lf_tag.id.platform_instance
-                    == lake_formation_tag_id.platform_instance
-                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
-                ):
-                    return lf_tag
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
         return cls(
-            id=
+            id=default_entity_id,
             datahub_urns=LinkedResourceSet(urns=[]),
             managed_by_datahub=managed_by_datahub,
             allowed_values=None,
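Taken together, these hunks move the per-call platform_instance/catalog arguments onto the repository object and replace in-place mutation with freshly constructed IDs. A minimal sketch of how a caller might drive the new flow; repository construction and the exact attribute names read off this diff (platform_instance, catalog) are assumptions, not verified against the released source:

from datahub.api.entities.external.lake_formation_external_entites import (
    LakeFormationTag,
)
from datahub.ingestion.source.aws.platform_resource_repository import (
    GluePlatformResourceRepository,
)
from datahub.ingestion.source.aws.tag_entities import (
    LakeFormationTagPlatformResource,
    LakeFormationTagPlatformResourceId,
)


def ensure_tag_entity(
    repo: GluePlatformResourceRepository, tag: LakeFormationTag
) -> LakeFormationTagPlatformResource:
    # The repository now carries platform_instance and catalog, so callers
    # no longer thread them through every call.
    tag_id = LakeFormationTagPlatformResourceId.get_or_create_from_tag(
        tag=tag,
        platform_resource_repository=repo,
        exists_in_lake_formation=False,
    )
    # create_default builds a fresh entity with safe defaults when the tag
    # is not yet known to DataHub (nothing is mutated in place).
    return LakeFormationTagPlatformResource.create_default(
        entity_id=tag_id, managed_by_datahub=True
    )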
datahub/ingestion/source/common/subtypes.py

@@ -30,6 +30,7 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
     API_ENDPOINT = "API Endpoint"
     SLACK_CHANNEL = "Slack Channel"
     PROJECTIONS = "Projections"
datahub/ingestion/source/hex/api.py

@@ -375,6 +375,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,
@@ -389,6 +390,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,
datahub/ingestion/source/hex/mapper.py

@@ -122,7 +122,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=project.created_at, last_edited_at=project.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(project),
             customProperties=dict(id=project.id),
             datasetEdges=self._dataset_edges(project.upstream_datasets),
             # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=component.created_at, last_edited_at=component.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(component),
             customProperties=dict(id=component.id),
         )
 
@@ -242,6 +242,20 @@ class Mapper:
         assert isinstance(dashboard_urn, DashboardUrn)
         return dashboard_urn
 
+    def _get_project_or_component_external_url(
+        self,
+        project_or_component: Union[Project, Component],
+    ) -> Optional[str]:
+        if project_or_component.last_published_at is None:
+            return (
+                f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
+            )
+        else:
+            # published Projects/Components have a different URL that everybody, not just editors, can access
+            return (
+                f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
+            )
+
     def _change_audit_stamps(
         self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
     ) -> ChangeAuditStampsClass:
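The new `_get_project_or_component_external_url` helper encodes one rule: items that were never published keep the editor-only `/hex/` URL, while anything with a `last_published_at` timestamp gets the public `/app/` URL. A self-contained restatement of that branching; the base URL and workspace values are made up for illustration:

from datetime import datetime
from typing import Optional


def external_url(
    base_url: str, workspace: str, item_id: str, last_published_at: Optional[datetime]
) -> str:
    if last_published_at is None:
        # Drafts are only reachable through the editor URL.
        return f"{base_url}/{workspace}/hex/{item_id}"
    # Published projects/components get the app URL that any viewer can open.
    return f"{base_url}/{workspace}/app/{item_id}"


assert external_url("https://app.hex.tech", "acme", "p1", None).endswith("/hex/p1")
assert external_url(
    "https://app.hex.tech", "acme", "p1", datetime(2024, 1, 1)
).endswith("/app/p1")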
datahub/ingestion/source/hex/model.py

@@ -46,6 +46,7 @@ class Project:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None  # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None
datahub/ingestion/source/looker/looker_common.py

@@ -379,6 +379,14 @@ class ExploreUpstreamViewField:
                 : -(len(self.field.field_group_variant.lower()) + 1)
             ]
 
+        # Validate that field_name is not empty to prevent invalid schema field URNs
+        if not field_name or not field_name.strip():
+            logger.warning(
+                f"Empty field name detected for field '{self.field.name}' in explore '{self.explore.name}'. "
+                f"Skipping field to prevent invalid schema field URN generation."
+            )
+            return None
+
         assert view_name  # for lint false positive
 
         project_include: ProjectInclude = ProjectInclude(
@@ -1351,7 +1359,25 @@ class LookerExplore:
         fine_grained_lineages = []
         if config.extract_column_level_lineage:
             for field in self.fields or []:
+                # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
+                if not field.name or not field.name.strip():
+                    logger.warning(
+                        f"Skipping fine-grained lineage for field with empty name in explore '{self.name}'"
+                    )
+                    continue
+
                 for upstream_column_ref in field.upstream_fields:
+                    # Skip creating fine-grained lineage for empty column names to prevent invalid schema field URNs
+                    if (
+                        not upstream_column_ref.column
+                        or not upstream_column_ref.column.strip()
+                    ):
+                        logger.warning(
+                            f"Skipping some fine-grained lineage for field '{field.name}' in explore '{self.name}' "
+                            f"due to empty upstream column name in table '{upstream_column_ref.table}'"
+                        )
+                        continue
+
                     fine_grained_lineages.append(
                         FineGrainedLineageClass(
                             upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
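Both guards pair a truthiness check with `strip()` because a whitespace-only name is truthy yet would still yield a malformed schemaField URN. A quick illustration of the predicate in isolation:

from typing import Optional


def is_usable_field_name(name: Optional[str]) -> bool:
    # Mirrors the checks above: reject None, "", and whitespace-only names.
    return bool(name and name.strip())


assert not is_usable_field_name(None)
assert not is_usable_field_name("")
assert not is_usable_field_name("   ")  # truthy, but caught by strip()
assert is_usable_field_name("order_id")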
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -8,7 +8,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
 )
 from datahub.utilities.prefix_batch_builder import PrefixGroup
 
-
+SHOW_COMMAND_MAX_PAGE_SIZE = 10000
 SHOW_STREAM_MAX_PAGE_SIZE = 10000
 
 
@@ -38,12 +38,23 @@ class SnowflakeQuery:
         SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
         SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
         SnowflakeObjectDomain.STREAM.capitalize(),
+        SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
     }
 
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
 
+    # Domains that can be downstream tables in lineage
+    DOWNSTREAM_TABLE_DOMAINS = {
+        SnowflakeObjectDomain.TABLE.capitalize(),
+        SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
+    }
+
+    DOWNSTREAM_TABLE_DOMAINS_FILTER = "({})".format(
+        ",".join(f"'{domain}'" for domain in DOWNSTREAM_TABLE_DOMAINS)
+    )
+
     @staticmethod
     def current_account() -> str:
         return "select CURRENT_ACCOUNT()"
@@ -235,7 +246,7 @@ class SnowflakeQuery:
     @staticmethod
     def show_views_for_database(
         db_name: str,
-        limit: int =
+        limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
         view_pagination_marker: Optional[str] = None,
     ) -> str:
         # While there is an information_schema.views view, that only shows the view definition if the role
@@ -244,7 +255,7 @@ class SnowflakeQuery:
 
         # SHOW VIEWS can return a maximum of 10000 rows.
         # https://docs.snowflake.com/en/sql-reference/sql/show-views#usage-notes
-        assert limit <=
+        assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
 
         # To work around this, we paginate through the results using the FROM clause.
         from_clause = (
@@ -686,7 +697,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
                 AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
                 AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
                 AND upstream_table_domain in {allowed_upstream_table_domains}
-                AND downstream_table_domain
+                AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
                 {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
             ),
             column_upstream_jobs AS (
@@ -843,7 +854,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
                 AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
                 AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
                 AND upstream_table_domain in {allowed_upstream_table_domains}
-                AND downstream_table_domain
+                AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
                 {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
             ),
             table_upstream_jobs_unique AS (
@@ -940,3 +951,37 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
         return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
+
+    @staticmethod
+    def show_dynamic_tables_for_database(
+        db_name: str,
+        limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
+        dynamic_table_pagination_marker: Optional[str] = None,
+    ) -> str:
+        """Get dynamic table definitions using SHOW DYNAMIC TABLES."""
+        assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
+
+        from_clause = (
+            f"""FROM '{dynamic_table_pagination_marker}'"""
+            if dynamic_table_pagination_marker
+            else ""
+        )
+        return f"""\
+SHOW DYNAMIC TABLES IN DATABASE "{db_name}"
+LIMIT {limit} {from_clause};
+"""
+
+    @staticmethod
+    def get_dynamic_table_graph_history(db_name: str) -> str:
+        """Get dynamic table dependency information from information schema."""
+        return f"""
+        SELECT
+            name,
+            inputs,
+            target_lag_type,
+            target_lag_sec,
+            scheduling_state,
+            alter_trigger
+        FROM TABLE("{db_name}".INFORMATION_SCHEMA.DYNAMIC_TABLE_GRAPH_HISTORY())
+        ORDER BY name
+        """