acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/METADATA +2631 -2631
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/RECORD +41 -39
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/aws/tag_entities.py
@@ -1,5 +1,10 @@
 import logging
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )

 from pydantic import BaseModel

@@ -7,7 +12,6 @@ from datahub.api.entities.external.external_entities import (
     ExternalEntity,
     ExternalEntityId,
     LinkedResourceSet,
-    PlatformResourceRepository,
 )
 from datahub.api.entities.external.lake_formation_external_entites import (
     LakeFormationTag,
@@ -15,10 +19,8 @@ from datahub.api.entities.external.lake_formation_external_entites import (
 from datahub.api.entities.platformresource.platform_resource import (
     PlatformResource,
     PlatformResourceKey,
-    PlatformResourceSearchFields,
 )
 from datahub.metadata.urns import TagUrn
-from datahub.utilities.search_utils import ElasticDocumentQuery
 from datahub.utilities.urns.urn import Urn

 logger = logging.getLogger(__name__)
@@ -29,8 +31,12 @@ class LakeFormationTagSyncContext(BaseModel):
     platform_instance: Optional[str] = None
     catalog: Optional[str] = None

+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+

-class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
     """
     A LakeFormationTag is a unique identifier for a Lakeformation tag.
     """
@@ -42,9 +48,6 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     exists_in_lake_formation: bool = False
     persisted: bool = False

-    def __hash__(self) -> int:
-        return hash(self.to_platform_resource_key().id)
-
     # this is a hack to make sure the property is a string and not private pydantic field
     @staticmethod
     def _RESOURCE_TYPE() -> str:
@@ -61,24 +64,26 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )

     @classmethod
-    def
+    def get_or_create_from_tag(
         cls,
         tag: LakeFormationTag,
-
-        platform_resource_repository: PlatformResourceRepository,
-        catalog: Optional[str] = None,
+        platform_resource_repository: "GluePlatformResourceRepository",
         exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
     ) -> "LakeFormationTagPlatformResourceId":
         """
         Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
         """

+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
         existing_platform_resource = cls.search_by_urn(
             tag.to_datahub_tag_urn().urn(),
             platform_resource_repository=platform_resource_repository,
             tag_sync_context=LakeFormationTagSyncContext(
-                platform_instance=platform_instance,
-                catalog=catalog,
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
             ),
         )
         if existing_platform_resource:
@@ -90,9 +95,9 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         return LakeFormationTagPlatformResourceId(
             tag_key=str(tag.key),
             tag_value=str(tag.value) if tag.value is not None else None,
-            platform_instance=platform_instance,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
             exists_in_lake_formation=exists_in_lake_formation,
-            catalog=catalog,
             persisted=False,
         )

@@ -100,64 +105,48 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     def search_by_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> Optional["LakeFormationTagPlatformResourceId"]:
-
-
-
-
-
-
-
-
-
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
                 )
-
-
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
         logger.info(
-            f"
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
         )
-        if len(mapped_tags) > 0:
-            for platform_resource in mapped_tags:
-                if (
-                    platform_resource.resource_info
-                    and platform_resource.resource_info.value
-                ):
-                    lake_formation_tag_platform_resource = (
-                        LakeFormationTagPlatformResource(
-                            **platform_resource.resource_info.value.as_pydantic_object(
-                                LakeFormationTagPlatformResource
-                            ).dict()
-                        )
-                    )
-                    if (
-                        lake_formation_tag_platform_resource.id.platform_instance
-                        == tag_sync_context.platform_instance
-                        and lake_formation_tag_platform_resource.id.catalog
-                        == tag_sync_context.catalog
-                    ):
-                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
-                        lake_formation_tag_id.exists_in_lake_formation = True
-                        lake_formation_tag_id.persisted = True
-                        return lake_formation_tag_id
-                else:
-                    logger.warning(
-                        f"Platform resource {platform_resource} does not have a resource_info value"
-                    )
-                    continue
-
-        # If we reach here, it means we did not find a mapped tag for the URN
-        logger.info(
-            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
-        )
         return None

     @classmethod
     def from_datahub_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> "LakeFormationTagPlatformResourceId":
         """
@@ -188,11 +177,17 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
             logger.info(
                 f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
             )
-
-
+            # Create a new ID with the correct state instead of mutating
+            return LakeFormationTagPlatformResourceId(
+                tag_key=new_tag_id.tag_key,
+                tag_value=new_tag_id.tag_value,
+                platform_instance=new_tag_id.platform_instance,
+                catalog=new_tag_id.catalog,
+                exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                persisted=new_tag_id.persisted,
             )
         return new_tag_id
-        raise ValueError(f"Unable to create
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")

     @classmethod
     def generate_tag_id(
@@ -223,7 +218,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )


-class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+class LakeFormationTagPlatformResource(ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
@@ -246,46 +241,29 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
     )

     @classmethod
-    def
+    def create_default(
         cls,
-
-
-        managed_by_datahub: bool = False,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
     ) -> "LakeFormationTagPlatformResource":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        for platform_resource in platform_resources:
-            if (
-                platform_resource.resource_info
-                and platform_resource.resource_info.value
-            ):
-                lf_tag = LakeFormationTagPlatformResource(
-                    **platform_resource.resource_info.value.as_pydantic_object(
-                        LakeFormationTagPlatformResource
-                    ).dict()
-                )
-                if (
-                    lf_tag.id.platform_instance
-                    == lake_formation_tag_id.platform_instance
-                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
-                ):
-                    return lf_tag
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
         return cls(
-            id=
+            id=default_entity_id,
             datahub_urns=LinkedResourceSet(urns=[]),
             managed_by_datahub=managed_by_datahub,
             allowed_values=None,
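
The recurring comment in these hunks, "Create a new ID with the correct state instead of mutating", reflects a switch from mutating cached entity IDs to rebuilding them. A minimal sketch of that pattern with a hypothetical, simplified pydantic model (not the actual DataHub classes):

from typing import Optional

from pydantic import BaseModel


class TagId(BaseModel):
    # Hypothetical stand-in for LakeFormationTagPlatformResourceId.
    tag_key: str
    tag_value: Optional[str] = None
    exists_in_lake_formation: bool = False
    persisted: bool = False


def mark_found(cached: TagId) -> TagId:
    # Rebuild with the desired flags instead of setting them on the cached
    # object, so instances shared through a repository cache stay unchanged.
    return TagId(
        tag_key=cached.tag_key,
        tag_value=cached.tag_value,
        exists_in_lake_formation=True,
        persisted=True,
    )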

datahub/ingestion/source/common/subtypes.py
@@ -30,6 +30,7 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
     API_ENDPOINT = "API Endpoint"
     SLACK_CHANNEL = "Slack Channel"
     PROJECTIONS = "Projections"

datahub/ingestion/source/hex/api.py
@@ -375,6 +375,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,
@@ -389,6 +390,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,

datahub/ingestion/source/hex/mapper.py
@@ -122,7 +122,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=project.created_at, last_edited_at=project.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(project),
             customProperties=dict(id=project.id),
             datasetEdges=self._dataset_edges(project.upstream_datasets),
             # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=component.created_at, last_edited_at=component.last_edited_at
             ),
-            externalUrl=
+            externalUrl=self._get_project_or_component_external_url(component),
             customProperties=dict(id=component.id),
         )

@@ -242,6 +242,20 @@ class Mapper:
         assert isinstance(dashboard_urn, DashboardUrn)
         return dashboard_urn

+    def _get_project_or_component_external_url(
+        self,
+        project_or_component: Union[Project, Component],
+    ) -> Optional[str]:
+        if project_or_component.last_published_at is None:
+            return (
+                f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
+            )
+        else:
+            # published Projects/Components have a different URL that everybody, not just editors, can access
+            return (
+                f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
+            )
+
     def _change_audit_stamps(
         self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
     ) -> ChangeAuditStampsClass:
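
The new _get_project_or_component_external_url helper chooses between the editor-only /hex/ URL and the public /app/ URL based on last_published_at. A standalone sketch of the same branching, with made-up base URL and workspace values:

from datetime import datetime, timezone
from typing import Optional

BASE_URL = "https://app.hex.tech"  # illustrative value
WORKSPACE = "acme"  # illustrative value


def external_url(item_id: str, last_published_at: Optional[datetime]) -> str:
    if last_published_at is None:
        # Unpublished: URL only editors can open.
        return f"{BASE_URL}/{WORKSPACE}/hex/{item_id}"
    # Published: URL that everybody, not just editors, can access.
    return f"{BASE_URL}/{WORKSPACE}/app/{item_id}"


print(external_url("abc123", None))
# https://app.hex.tech/acme/hex/abc123
print(external_url("abc123", datetime(2024, 1, 1, tzinfo=timezone.utc)))
# https://app.hex.tech/acme/app/abc123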

datahub/ingestion/source/hex/model.py
@@ -46,6 +46,7 @@ class Project:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None  # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None

datahub/ingestion/source/looker/looker_common.py
@@ -379,6 +379,14 @@ class ExploreUpstreamViewField:
                 : -(len(self.field.field_group_variant.lower()) + 1)
             ]

+        # Validate that field_name is not empty to prevent invalid schema field URNs
+        if not field_name or not field_name.strip():
+            logger.warning(
+                f"Empty field name detected for field '{self.field.name}' in explore '{self.explore.name}'. "
+                f"Skipping field to prevent invalid schema field URN generation."
+            )
+            return None
+
         assert view_name  # for lint false positive

         project_include: ProjectInclude = ProjectInclude(
@@ -1351,7 +1359,25 @@ class LookerExplore:
         fine_grained_lineages = []
         if config.extract_column_level_lineage:
             for field in self.fields or []:
+                # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
+                if not field.name or not field.name.strip():
+                    logger.warning(
+                        f"Skipping fine-grained lineage for field with empty name in explore '{self.name}'"
+                    )
+                    continue
+
                 for upstream_column_ref in field.upstream_fields:
+                    # Skip creating fine-grained lineage for empty column names to prevent invalid schema field URNs
+                    if (
+                        not upstream_column_ref.column
+                        or not upstream_column_ref.column.strip()
+                    ):
+                        logger.warning(
+                            f"Skipping some fine-grained lineage for field '{field.name}' in explore '{self.name}' "
+                            f"due to empty upstream column name in table '{upstream_column_ref.table}'"
+                        )
+                        continue
+
                     fine_grained_lineages.append(
                         FineGrainedLineageClass(
                             upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
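
Both looker_common.py hunks apply the same guard: a field or upstream column whose name is empty or whitespace-only is skipped before it can be turned into a schema field URN. The check in isolation:

from typing import Optional


def is_usable_field_name(name: Optional[str]) -> bool:
    # Empty or whitespace-only names would yield invalid schema field URNs.
    return bool(name and name.strip())


assert not is_usable_field_name(None)
assert not is_usable_field_name("   ")
assert is_usable_field_name("order_id")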

datahub/ingestion/source/mock_data/datahub_mock_data.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union

 from pydantic import Field

@@ -14,6 +14,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
+from datahub.ingestion.api.source_helpers import AutoSystemMetadata, auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
@@ -31,6 +32,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sdk.entity import Entity
 from datahub.utilities.str_enum import StrEnum

 logger = logging.getLogger(__name__)
@@ -165,6 +167,14 @@ class DataHubMockDataSource(Source):
         self.report = DataHubMockDataReport()

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = [AutoSystemMetadata(self.ctx).stamp]
+        return self._apply_workunit_processors(
+            workunit_processors, auto_workunit(self.get_workunits_internal())
+        )
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         # We don't want any implicit aspects to be produced
         # so we are not using get_workunits_internal


datahub/ingestion/source/snowflake/snowflake_config.py
@@ -356,11 +356,18 @@ class SnowflakeV2Config(

     pushdown_deny_usernames: List[str] = Field(
         default=[],
-        description="List of snowflake usernames which will
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
         "Only applicable if `use_queries_v2` is enabled.",
     )

+    pushdown_allow_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
+    )
+
     push_down_database_pattern_access_history: bool = Field(
         default=False,
         description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
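
The two pushdown fields compose: deny patterns always exclude, and a non-empty allow list additionally restricts extraction to matching users. A hypothetical recipe fragment (pattern values are made up; field names come from the diff above):

# Illustrative SnowflakeV2Config values, recipe-style.
snowflake_source_config = {
    "use_queries_v2": True,
    # Exclude noisy service accounts (SQL LIKE patterns).
    "pushdown_deny_usernames": ["SERVICE_%", "%_PROD"],
    # Only consider analyst accounts; leave empty to keep everyone
    # not matched by the deny list.
    "pushdown_allow_usernames": ["ANALYST_%"],
}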

datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -89,10 +89,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):

     pushdown_deny_usernames: List[str] = pydantic.Field(
         default=[],
-        description="List of snowflake usernames which will
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
     )

+    pushdown_allow_usernames: List[str] = pydantic.Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "If not specified, all users not in deny list are included.",
+    )
+
     user_email_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for user emails to filter in usage.",
@@ -396,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
+            allow_usernames=self.config.pushdown_allow_usernames,
             dedup_strategy=self.config.query_dedup_strategy,
             database_pattern=self.filters.filter_config.database_pattern
             if self.config.push_down_database_pattern_access_history
@@ -740,7 +748,8 @@ class QueryLogQueryBuilder:
         start_time: datetime,
         end_time: datetime,
         bucket_duration: BucketDuration,
-        deny_usernames: Optional[List[str]],
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
         max_tables_per_query: int = 20,
         dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
         database_pattern: Optional[AllowDenyPattern] = None,
@@ -753,10 +762,7 @@ class QueryLogQueryBuilder:
         self.max_tables_per_query = max_tables_per_query
         self.dedup_strategy = dedup_strategy

-        self.users_filter =
-        if deny_usernames:
-            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-            self.users_filter = f"user_name NOT IN ({user_not_in})"
+        self.users_filter = self._build_user_filter(deny_usernames, allow_usernames)

         self.access_history_database_filter = (
             self._build_access_history_database_filter_condition(
@@ -767,6 +773,43 @@ class QueryLogQueryBuilder:
         self.time_bucket_size = bucket_duration.value
         assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")

+    def _build_user_filter(
+        self,
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build user filter SQL condition based on deny and allow username patterns.
+
+        Args:
+            deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
+            allow_usernames: List of username patterns to include (SQL LIKE patterns)
+
+        Returns:
+            SQL WHERE condition string for filtering users
+        """
+        user_filters = []
+
+        if deny_usernames:
+            deny_conditions = []
+            for pattern in deny_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                deny_conditions.append(f"user_name NOT ILIKE '{escaped_pattern}'")
+            if deny_conditions:
+                user_filters.append(f"({' AND '.join(deny_conditions)})")
+
+        if allow_usernames:
+            allow_conditions = []
+            for pattern in allow_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                allow_conditions.append(f"user_name ILIKE '{escaped_pattern}'")
+            if allow_conditions:
+                user_filters.append(f"({' OR '.join(allow_conditions)})")
+
+        return " AND ".join(user_filters) if user_filters else "TRUE"
+
     def _build_access_history_database_filter_condition(
         self,
         database_pattern: Optional[AllowDenyPattern],
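
Tracing _build_user_filter by hand for one input shows how the two lists combine (the expected string below follows from the code above, not from running the package):

def esc(pattern: str) -> str:
    # Same single-quote escaping as _build_user_filter.
    return pattern.replace("'", "''")


deny = ["SERVICE_%", "TEST'USER"]
allow = ["ANALYST_%", "%_USER"]

deny_sql = " AND ".join(f"user_name NOT ILIKE '{esc(p)}'" for p in deny)
allow_sql = " OR ".join(f"user_name ILIKE '{esc(p)}'" for p in allow)
print(f"({deny_sql}) AND ({allow_sql})")
# (user_name NOT ILIKE 'SERVICE_%' AND user_name NOT ILIKE 'TEST''USER')
#     AND (user_name ILIKE 'ANALYST_%' OR user_name ILIKE '%_USER')

With both lists empty, the method returns "TRUE", i.e., no user filtering is applied.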
|