acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
+++ b/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
         tenant_id: str,
         metadata_api_timeout: int,
     ):
-        self.…
-        self.…
-
+        self._access_token: Optional[str] = None
+        self._access_token_expiry_time: Optional[datetime] = None
+
+        self._tenant_id = tenant_id
         # Test connection by generating access token
         logger.info(f"Trying to connect to {self._get_authority_url()}")
         # Power-Bi Auth (Service Principal Auth)
-        self.…
+        self._msal_client = msal.ConfidentialClientApplication(
             client_id,
             client_credential=client_secret,
             authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
         pass

     def _get_authority_url(self):
-        return f"{DataResolverBase.AUTHORITY}{self.…
+        return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"

     def get_authorization_header(self):
         return {Constant.Authorization: self.get_access_token()}

-    def get_access_token(self):
-        if self.…
-            return self.…
+    def get_access_token(self) -> str:
+        if self._access_token is not None and not self._is_access_token_expired():
+            return self._access_token

         logger.info("Generating PowerBi access token")

-        auth_response = self.…
+        auth_response = self._msal_client.acquire_token_for_client(
             scopes=[DataResolverBase.SCOPE]
         )

@@ -193,24 +194,24 @@ class DataResolverBase(ABC):

         logger.info("Generated PowerBi access token")

-        self.…
+        self._access_token = "Bearer {}".format(
             auth_response.get(Constant.ACCESS_TOKEN)
         )
         safety_gap = 300
-        self.…
+        self._access_token_expiry_time = datetime.now() + timedelta(
             seconds=(
                 max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
             )
         )

-        logger.debug(f"{Constant.PBIAccessToken}={self.…
+        logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")

-        return self.…
+        return self._access_token

     def _is_access_token_expired(self) -> bool:
-        if not self.…
+        if not self._access_token_expiry_time:
             return True
-        return self.…
+        return self._access_token_expiry_time < datetime.now()

     def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
         """
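The hunks above rename the resolver's private attributes and add explicit typing around token caching. The caching idea itself is worth spelling out; below is a minimal, self-contained sketch of the same pattern (cache the token, subtract a safety gap from the reported expiry, refresh on demand) using a stubbed acquire function in place of the real msal client.

from datetime import datetime, timedelta
from typing import Optional

class TokenCache:
    """Sketch of the access-token caching pattern used by DataResolverBase."""

    def __init__(self) -> None:
        self._access_token: Optional[str] = None
        self._access_token_expiry_time: Optional[datetime] = None

    def _acquire(self) -> dict:
        # Stand-in for msal.ConfidentialClientApplication.acquire_token_for_client().
        return {"access_token": "abc123", "expires_in": 3599}

    def get_access_token(self) -> str:
        if self._access_token is not None and not self._is_expired():
            return self._access_token
        resp = self._acquire()
        self._access_token = "Bearer {}".format(resp["access_token"])
        safety_gap = 300  # refresh five minutes early, never use a nearly-dead token
        self._access_token_expiry_time = datetime.now() + timedelta(
            seconds=max(resp["expires_in"] - safety_gap, 0)
        )
        return self._access_token

    def _is_expired(self) -> bool:
        if not self._access_token_expiry_time:
            return True
        return self._access_token_expiry_time < datetime.now()

cache = TokenCache()
assert cache.get_access_token() == cache.get_access_token()  # second call is cached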
--- a/datahub/ingestion/source/s3/source.py
+++ b/datahub/ingestion/source/s3/source.py
@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.…
+        os.environ.setdefault("SPARK_VERSION", "3.5")
        spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases
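The Spark version bump only changes a default: os.environ.setdefault writes the value only when the variable is unset, so an operator's explicit SPARK_VERSION override still wins. A quick demonstration:

import os

os.environ.pop("SPARK_VERSION", None)
os.environ.setdefault("SPARK_VERSION", "3.5")
assert os.environ["SPARK_VERSION"] == "3.5"  # default applied

os.environ["SPARK_VERSION"] = "3.3"
os.environ.setdefault("SPARK_VERSION", "3.5")  # no-op: already set
assert os.environ["SPARK_VERSION"] == "3.3"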
--- a/datahub/ingestion/source/snowflake/snowflake_config.py
+++ b/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
         description="Whether to convert dataset urns to lowercase.",
     )

-
-class SnowflakeUsageConfig(BaseUsageConfig):
     email_domain: Optional[str] = pydantic.Field(
         default=None,
         description="Email domain of your organization so users can be displayed on UI appropriately.",
     )
+
+    email_as_user_identifier: bool = Field(
+        default=True,
+        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+        "provided, generates email addresses for snowflake users with unset emails, based on their "
+        "username.",
+    )
+
+
+class SnowflakeUsageConfig(BaseUsageConfig):
     apply_view_usage_to_tables: bool = pydantic.Field(
         default=False,
         description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
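The `email_as_user_identifier` field moves from the usage config onto SnowflakeIdentifierConfig, next to `email_domain`. The behavior the two descriptions spell out, sketched below as a hedged stand-in for the real `get_user_identifier` helper (the actual implementation lives on the identifiers object): prefer the user's email, else synthesize one from `email_domain`, else fall back to the raw username.

from typing import Optional

def get_user_identifier(
    user_name: str,
    user_email: Optional[str],
    email_as_user_identifier: bool = True,
    email_domain: Optional[str] = None,
) -> str:
    # Illustrative only; mirrors the config descriptions above.
    if email_as_user_identifier:
        if user_email:
            return user_email.lower()
        if email_domain:
            return f"{user_name}@{email_domain}".lower()
    return user_name.lower()

assert get_user_identifier("ALICE", "alice@corp.com") == "alice@corp.com"
assert get_user_identifier("BOB", None, email_domain="corp.com") == "bob@corp.com"
assert get_user_identifier("EVE", None) == "eve"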
@@ -163,26 +171,13 @@ class SnowflakeConfig(
         default=True,
         description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.",
     )
-
-
-
-    )
+
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")

     ignore_start_time_lineage: bool = False
     upstream_lineage_in_report: bool = False

-    @pydantic.root_validator(skip_on_failure=True)
-    def validate_include_view_lineage(cls, values):
-        if (
-            "include_table_lineage" in values
-            and not values.get("include_table_lineage")
-            and values.get("include_view_lineage")
-        ):
-            raise ValueError(
-                "include_table_lineage must be True for include_view_lineage to be set."
-            )
-        return values
-

 class SnowflakeV2Config(
     SnowflakeConfig,
@@ -222,11 +217,6 @@ class SnowflakeV2Config(
         description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.",
     )

-    include_view_column_lineage: bool = Field(
-        default=True,
-        description="Populates view->view and table->view column lineage using DataHub's sql parser.",
-    )
-
     use_queries_v2: bool = Field(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
@@ -285,13 +275,6 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

-    email_as_user_identifier: bool = Field(
-        default=True,
-        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
-        "provided, generates email addresses for snowflake users with unset emails, based on their "
-        "username.",
-    )
-
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
@@ -355,10 +338,6 @@ class SnowflakeV2Config(
             self, database=database, username=username, password=password, role=role
         )

-    @property
-    def parse_view_ddl(self) -> bool:
-        return self.include_view_column_lineage
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
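`include_view_lineage` and `include_view_column_lineage` are retired via `pydantic_removed_field` sentinels rather than hard validation failures, so old recipes keep parsing. A hedged sketch of how such a helper typically works (pydantic v1 API; an approximation, not DataHub's exact implementation):

import warnings
import pydantic  # pydantic v1 API

def pydantic_removed_field(field_name: str):
    def _strip_removed_field(cls, values: dict) -> dict:
        # Drop the removed key with a warning instead of failing validation.
        if field_name in values:
            warnings.warn(f"The {field_name} config option was removed; ignoring it.")
            values.pop(field_name)
        return values
    return pydantic.root_validator(pre=True, allow_reuse=True)(_strip_removed_field)

class ExampleConfig(pydantic.BaseModel):
    include_table_lineage: bool = True
    _include_view_lineage = pydantic_removed_field("include_view_lineage")

# An old recipe that still sets the removed flag parses cleanly.
cfg = ExampleConfig.parse_obj({"include_view_lineage": True})
assert cfg.include_table_lineage is True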
--- a/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -4,11 +4,10 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type

-from pydantic import BaseModel, validator
+from pydantic import BaseModel, Field, validator

 from datahub.configuration.datetimes import parse_absolute_time
 from datahub.ingestion.api.closeable import Closeable
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.snowflake.constants import (
     LINEAGE_PERMISSION_ERROR,
@@ -41,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime

@@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel):


 class ColumnUpstreamLineage(BaseModel):
-    column_name: str
-    upstreams: List[ColumnUpstreamJob]
+    column_name: Optional[str]
+    upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)


 class UpstreamTableNode(BaseModel):
@@ -163,11 +163,11 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             self.config.end_time,
         )

-    def …
+    def add_time_based_lineage_to_aggregator(
         self,
         discovered_tables: List[str],
         discovered_views: List[str],
-    ) -> …
+    ) -> None:
         if not self._should_ingest_lineage():
             return

@@ -177,9 +177,7 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         # snowflake view/table -> snowflake table
         self.populate_table_upstreams(discovered_tables)

-
-        yield mcp.as_workunit()
-
+    def update_state(self):
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
             self.redundant_run_skip_handler.update_state(
@@ -242,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)

         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
@@ -265,64 +266,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0

-            for (
-
-
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")

         self.report.external_lineage_queries_secs = timer.elapsed_seconds()

-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def …
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(
@@ -384,10 +338,6 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             start_time_millis=int(self.start_time.timestamp() * 1000),
             end_time_millis=int(self.end_time.timestamp() * 1000),
             upstreams_deny_pattern=self.config.temporary_tables_pattern,
-            # The self.config.include_view_lineage setting is about fetching upstreams of views.
-            # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False.
-            # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code.
-            include_view_lineage=True,
             include_column_lineage=self.config.include_column_lineage,
         )
         try:
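Both the lineage extractor and the queries extractor now stamp lineage with `get_query_fingerprint(..., fast=True)` so the same statement always maps to the same query_id. The real helper canonicalizes SQL with sqlglot; the sketch below only approximates the idea (whitespace and case normalization plus a hash) to show why two formattings of one query collapse to a single id.

import hashlib
import re

def fast_fingerprint(query_text: str, platform: str) -> str:
    # Crude stand-in for datahub.sql_parsing.sqlglot_utils.get_query_fingerprint.
    normalized = re.sub(r"\s+", " ", query_text).strip().lower()
    return hashlib.sha256(f"{platform}:{normalized}".encode()).hexdigest()[:16]

a = fast_fingerprint("SELECT *\n  FROM orders", "snowflake")
b = fast_fingerprint("select * from orders", "snowflake")
assert a == b  # same logical query -> same query_id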
--- a/datahub/ingestion/source/snowflake/snowflake_queries.py
+++ b/datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -61,11 +61,17 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

+# Define a type alias
+UserName = str
+UserEmail = str
+UsersMapping = Dict[UserName, UserEmail]
+

 class SnowflakeQueriesExtractorConfig(ConfigModel):
     # TODO: Support stateful ingestion for the time windows.
@@ -114,11 +120,13 @@ class SnowflakeQueriesSourceConfig(
 class SnowflakeQueriesExtractorReport(Report):
     copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_users: int = 0


 @dataclass
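`users_fetch_timer` follows the same reporting pattern as the existing timers: PerfTimer is used as a context manager around the fetch, and its elapsed time lands in the report. For example (using PerfTimer exactly as the surrounding code does; the dict is a stand-in for the actual Snowflake query):

from datahub.utilities.perf_timer import PerfTimer

users_fetch_timer = PerfTimer()
with users_fetch_timer:
    users = {"ALICE": "alice@corp.com"}  # stand-in for fetch_users()
print(f"users fetch took {users_fetch_timer.elapsed_seconds():.3f}s")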
@@ -225,6 +233,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        with self.report.users_fetch_timer:
+            users = self.fetch_users()
+
         # TODO: Add some logic to check if the cached audit log is stale or not.
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
@@ -247,11 +258,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 for entry in self.fetch_copy_history():
                     queries.append(entry)

-            # TODO: Add "show external tables" lineage to the main schema extractor.
-            # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
             with self.report.query_log_fetch_timer:
-                for entry in self.fetch_query_log():
+                for entry in self.fetch_query_log(users):
                     queries.append(entry)

             with self.report.audit_log_load_timer:
@@ -266,6 +274,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         shared_connection.close()
         audit_log_file.unlink(missing_ok=True)

+    def fetch_users(self) -> UsersMapping:
+        users: UsersMapping = dict()
+        with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
+            logger.info("Fetching users from Snowflake")
+            query = SnowflakeQuery.get_all_users()
+            resp = self.connection.query(query)
+
+            for row in resp:
+                try:
+                    users[row["NAME"]] = row["EMAIL"]
+                    self.report.num_users += 1
+                except Exception as e:
+                    self.structured_reporter.warning(
+                        "Error parsing user row",
+                        context=f"{row}",
+                        exc=e,
+                    )
+        return users
+
     def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
         # Derived from _populate_external_lineage_from_copy_history.

@@ -301,7 +328,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield result

     def fetch_query_log(
-        self,
+        self, users: UsersMapping
     ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
@@ -322,7 +349,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

             assert isinstance(row, dict)
             try:
-                entry = self._parse_audit_log_row(row)
+                entry = self._parse_audit_log_row(row, users)
             except Exception as e:
                 self.structured_reporter.warning(
                     "Error parsing query log row",
@@ -334,7 +361,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 yield entry

     def _parse_audit_log_row(
-        self, row: Dict[str, Any]
+        self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
@@ -433,9 +460,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )

-
-
-
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )

         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)
@@ -447,10 +476,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
-            # job at eliminating redundant / repetitive queries. As such, we…
-            # here
-
-
+            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+            # here
+            query_id=get_query_fingerprint(
+                res["query_text"], self.identifiers.platform, fast=True
+            ),
             query_text=res["query_text"],
             upstreams=upstreams,
             downstream=downstream,
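`fetch_users` wraps its work in `structured_reporter.report_exc(...)`, which keeps a failed user fetch from aborting the whole extraction. A hedged sketch of that context-manager pattern (illustrative of the idea only, not DataHub's implementation):

import contextlib
import logging

logger = logging.getLogger(__name__)

@contextlib.contextmanager
def report_exc(message: str):
    # Convert exceptions into a logged warning instead of propagating them.
    try:
        yield
    except Exception:
        logger.warning(message, exc_info=True)

users: dict = {}
with report_exc("Error fetching users from Snowflake"):
    raise ConnectionError("simulated ACCOUNT_USAGE outage")  # swallowed and logged
# `users` stays empty, and the query-log fetch can still proceed.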
--- a/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -376,7 +376,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     def table_to_table_lineage_history_v2(
         start_time_millis: int,
         end_time_millis: int,
-        include_view_lineage: bool = True,
         include_column_lineage: bool = True,
         upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS,
     ) -> str:
@@ -385,14 +384,12 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )
         else:
             return SnowflakeQuery.table_upstreams_only(
                 start_time_millis,
                 end_time_millis,
                 upstreams_deny_pattern,
-                include_view_lineage,
             )

     @staticmethod
@@ -677,12 +674,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -847,12 +841,9 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         start_time_millis: int,
         end_time_millis: int,
         upstreams_deny_pattern: List[str],
-        include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
             SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER
-            if include_view_lineage
-            else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )

         upstream_sql_filter = create_deny_regex_sql_filter(
@@ -956,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
     ORDER BY MEASUREMENT_TIME ASC;

-"""
+"""
+
+    @staticmethod
+    def get_all_users() -> str:
+        return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
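The quoted aliases in `get_all_users()` pin the result column keys that `fetch_users` indexes with `row["NAME"]` and `row["EMAIL"]`. A hedged sketch of running the same statement directly with the Snowflake connector (credentials are placeholders; requires ACCOUNT_USAGE access):

import snowflake.connector
from snowflake.connector import DictCursor

conn = snowflake.connector.connect(account="...", user="...", password="...")
with conn.cursor(DictCursor) as cur:
    cur.execute('SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS')
    users = {row["NAME"]: row["EMAIL"] for row in cur}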
--- a/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import…
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-
+
+        self.aggregator = aggregator

     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)

+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -415,11 +435,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         )

         if self.config.include_views:
-            if (
-                self.aggregator
-                and self.config.include_view_lineage
-                and self.config.parse_view_ddl
-            ):
+            if self.aggregator:
                 for view in views:
                     view_identifier = self.identifiers.get_dataset_identifier(
                         view.name, schema_name, db_name
@@ -1082,3 +1098,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
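The new `_external_tables_ddl_lineage` method pairs the two helpers imported at the top of the file. A small, hedged demo of the mapping it emits for one external table (urn values are illustrative):

from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping

mapping = KnownLineageMapping(
    upstream_urn=make_s3_urn_for_lineage("s3://acme-bucket/raw/events/", "PROD"),
    downstream_urn=(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.events_ext,PROD)"
    ),
)
print(mapping.upstream_urn)  # an s3 dataset urn pointing at the stage location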
--- a/datahub/ingestion/source/snowflake/snowflake_shares.py
+++ b/datahub/ingestion/source/snowflake/snowflake_shares.py
@@ -72,7 +72,7 @@ class SnowflakeSharesHandler(SnowflakeCommonMixin):
         assert len(sibling_dbs) == 1
         # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy
         # hence this lineage code is not written in SnowflakeLineageExtractor
-        # also this is not governed by configs include_table_lineage…
+        # also this is not governed by configs include_table_lineage
         yield self.get_upstream_lineage_with_primary_sibling(
             db.name, schema.name, table_name, sibling_dbs[0]
         )
--- a/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
+++ b/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                 filtered_user_counts.append(
                     DatasetUserUsageCounts(
                         user=make_user_urn(
-                            self.get_user_identifier(
+                            self.identifiers.get_user_identifier(
                                 user_count["user_name"],
                                 user_email,
-                                self.config.email_as_user_identifier,
                             )
                         ),
                         count=user_count["total"],
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         reported_time: int = int(time.time() * 1000)
         last_updated_timestamp: int = int(start_time.timestamp() * 1000)
         user_urn = make_user_urn(
-            self.get_user_identifier(
-                user_name, user_email, self.config.email_as_user_identifier
-            )
+            self.identifiers.get_user_identifier(user_name, user_email)
         )

         # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
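With `email_as_user_identifier` now resolved inside `identifiers.get_user_identifier`, the usage extractor's call sites shrink to a single wrap in `make_user_urn`. The urn shape is the standard corpuser form:

from datahub.emitter.mce_builder import make_user_urn

assert make_user_urn("alice@corp.com") == "urn:li:corpuser:alice@corp.com"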
|