acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/lineage.py
@@ -1,21 +1,20 @@
 import logging
-import traceback
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
-import humanfriendly
 import redshift_connector
 import sqlglot
 
-import datahub.emitter.mce_builder as builder
 import datahub.sql_parsing.sqlglot_lineage as sqlglot_l
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import strip_s3_prefix
 from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
 from datahub.ingestion.source.redshift.query import (
@@ -35,30 +34,20 @@ from datahub.ingestion.source.redshift.report import RedshiftReport
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
-    FineGrainedLineage,
-    FineGrainedLineageDownstreamType,
-    FineGrainedLineageUpstreamType,
-    UpstreamLineage,
-)
-from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    OtherSchema,
-    SchemaField,
-    SchemaMetadata,
-)
 from datahub.metadata.schema_classes import (
     DatasetLineageTypeClass,
-    UpstreamClass,
-    UpstreamLineageClass,
 )
 from datahub.metadata.urns import DatasetUrn
-from datahub.sql_parsing.
-
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlParsingAggregator,
+    TableRename,
+)
 from datahub.sql_parsing.sqlglot_utils import get_dialect, parse_statement
-from datahub.utilities import
-from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.perf_timer import PerfTimer
 
-logger
+logger = logging.getLogger(__name__)
 
 
 class LineageDatasetPlatform(Enum):
@@ -100,30 +89,6 @@ class LineageItem:
         else:
             self.dataset_lineage_type = DatasetLineageTypeClass.TRANSFORMED
 
-    def merge_lineage(
-        self,
-        upstreams: Set[LineageDataset],
-        cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> None:
-        self.upstreams = self.upstreams.union(upstreams)
-
-        # Merge CLL using the output column name as the merge key.
-        self.cll = self.cll or []
-        existing_cll: Dict[str, sqlglot_l.ColumnLineageInfo] = {
-            c.downstream.column: c for c in self.cll
-        }
-        for c in cll or []:
-            if c.downstream.column in existing_cll:
-                # Merge using upstream + column name as the merge key.
-                existing_cll[c.downstream.column].upstreams = deduplicate_list(
-                    [*existing_cll[c.downstream.column].upstreams, *c.upstreams]
-                )
-            else:
-                # New output column, just add it as is.
-                self.cll.append(c)
-
-        self.cll = self.cll or None
-
 
 def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, str]:
     """
@@ -142,117 +107,48 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str,
     return schema, prev_name, new_name
 
 
-
-
-
-    # -3 because platform instance is optional and that can cause the split to have more than 3 elements
-    db, schema, table = qualified_table_name.split(".")[-3:]
+class RedshiftSqlLineage(Closeable):
+    # does lineage and usage based on SQL parsing.
 
-    return db, schema, table
-
-
-class RedshiftLineageExtractor:
     def __init__(
         self,
         config: RedshiftConfig,
         report: RedshiftReport,
         context: PipelineContext,
+        database: str,
        redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
     ):
+        self.platform = "redshift"
         self.config = config
         self.report = report
         self.context = context
-        self.
+        self.database = database
+        self.known_urns: Set[str] = set()  # will be set later
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            generate_lineage=True,
+            generate_queries=self.config.lineage_generate_queries,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            usage_config=self.config,
+            graph=self.context.graph,
+            is_temp_table=self._is_temp_table,
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
         self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
         if self.config.is_serverless:
             self.queries = RedshiftServerlessQuery()
 
-        self.redundant_run_skip_handler = redundant_run_skip_handler
         self.start_time, self.end_time = (
             self.report.lineage_start_time,
             self.report.lineage_end_time,
         ) = self.get_time_window()
 
-        self.temp_tables: Dict[str, TempTableRow] = {}
-
-    def _init_temp_table_schema(
-        self, database: str, temp_tables: List[TempTableRow]
-    ) -> None:
-        if self.context.graph is None:  # to silent lint
-            return
-
-        schema_resolver: SchemaResolver = self.context.graph._make_schema_resolver(
-            platform=LineageDatasetPlatform.REDSHIFT.value,
-            platform_instance=self.config.platform_instance,
-            env=self.config.env,
-        )
-
-        dataset_vs_columns: Dict[str, List[SchemaField]] = {}
-        # prepare dataset_urn vs List of schema fields
-        for table in temp_tables:
-            logger.debug(
-                f"Processing temp table: {table.create_command} with query text {table.query_text}"
-            )
-            result = sqlglot_l.create_lineage_sql_parsed_result(
-                platform=LineageDatasetPlatform.REDSHIFT.value,
-                platform_instance=self.config.platform_instance,
-                env=self.config.env,
-                default_db=database,
-                default_schema=self.config.default_schema,
-                query=table.query_text,
-                graph=self.context.graph,
-            )
-
-            if (
-                result is None
-                or result.column_lineage is None
-                or not result.query_type.is_create()
-                or not result.out_tables
-            ):
-                logger.debug(f"Unsupported temp table query found: {table.query_text}")
-                continue
-
-            table.parsed_result = result
-            if result.column_lineage[0].downstream.table:
-                table.urn = result.column_lineage[0].downstream.table
-
-            self.temp_tables[result.out_tables[0]] = table
-
-        for table in self.temp_tables.values():
-            if (
-                table.parsed_result is None
-                or table.urn is None
-                or table.parsed_result.column_lineage is None
-            ):
-                continue
-
-            # Initialise the temp table urn, we later need this to merge CLL
-            downstream_urn = table.urn
-            if downstream_urn not in dataset_vs_columns:
-                dataset_vs_columns[downstream_urn] = []
-            dataset_vs_columns[downstream_urn].extend(
-                sqlglot_l.infer_output_schema(table.parsed_result) or []
-            )
-
-        # Add datasets, and it's respective fields in schema_resolver, so that later schema_resolver would be able
-        # correctly generates the upstreams for temporary tables
-        for urn in dataset_vs_columns:
-            db, schema, table_name = split_qualified_table_name(urn)
-            schema_resolver.add_schema_metadata(
-                urn=urn,
-                schema_metadata=SchemaMetadata(
-                    schemaName=table_name,
-                    platform=builder.make_data_platform_urn(
-                        LineageDatasetPlatform.REDSHIFT.value
-                    ),
-                    version=0,
-                    hash="",
-                    platformSchema=OtherSchema(rawSchema=""),
-                    fields=dataset_vs_columns[urn],
-                ),
-            )
-
     def get_time_window(self) -> Tuple[datetime, datetime]:
         if self.redundant_run_skip_handler:
             self.report.stateful_lineage_ingestion_enabled = True
@@ -262,9 +158,20 @@ class RedshiftLineageExtractor:
         else:
             return self.config.start_time, self.config.end_time
 
-    def
-
-
+    def report_status(self, step: str, status: bool) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.report_current_run_status(step, status)
+
+    def _is_temp_table(self, name: str) -> bool:
+        return (
+            DatasetUrn.create_from_ids(
+                self.platform,
+                name,
+                env=self.config.env,
+                platform_instance=self.config.platform_instance,
+            ).urn()
+            not in self.known_urns
+        )
 
     def _get_s3_path(self, path: str) -> Optional[str]:
         if self.config.s3_lineage_config:
@@ -289,6 +196,15 @@ class RedshiftLineageExtractor:
 
         return path
 
+    def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
+        path = filename.strip()
+        if urlparse(path).scheme != "s3":
+            raise ValueError(
+                f"Only s3 source supported with copy/unload. The source was: {path}"
+            )
+        s3_path = self._get_s3_path(path)
+        return strip_s3_prefix(s3_path) if s3_path else None
+
     def _get_sources_from_query(
         self,
         db_name: str,
@@ -335,15 +251,6 @@ class RedshiftLineageExtractor:
             ),
         )
 
-    def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
-        path = filename.strip()
-        if urlparse(path).scheme != "s3":
-            raise ValueError(
-                f"Only s3 source supported with copy/unload. The source was: {path}"
-            )
-        s3_path = self._get_s3_path(path)
-        return strip_s3_prefix(s3_path) if s3_path else None
-
     def _get_sources(
         self,
         lineage_type: LineageCollectorType,
@@ -418,112 +325,6 @@ class RedshiftLineageExtractor:
 
         return sources, cll
 
-    def _populate_lineage_map(
-        self,
-        query: str,
-        database: str,
-        lineage_type: LineageCollectorType,
-        connection: redshift_connector.Connection,
-        all_tables_set: Dict[str, Dict[str, Set[str]]],
-    ) -> None:
-        """
-        This method generate table level lineage based with the given query.
-        The query should return the following columns: target_schema, target_table, source_table, source_schema
-        source_table and source_schema can be omitted if the sql_field is set because then it assumes the source_table
-        and source_schema will be extracted from the sql_field by sql parsing.
-
-        :param query: The query to run to extract lineage.
-        :type query: str
-        :param lineage_type: The way the lineage should be processed
-        :type lineage_type: LineageType
-        return: The method does not return with anything as it directly modify the self._lineage_map property.
-        :rtype: None
-        """
-
-        logger.info(f"Extracting {lineage_type.name} lineage for db {database}")
-        try:
-            logger.debug(f"Processing lineage query: {query}")
-            cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None
-            raw_db_name = database
-            alias_db_name = self.config.database
-
-            for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                conn=connection, query=query
-            ):
-                target = self._get_target_lineage(
-                    alias_db_name,
-                    lineage_row,
-                    lineage_type,
-                    all_tables_set=all_tables_set,
-                )
-                if not target:
-                    continue
-
-                logger.debug(
-                    f"Processing {lineage_type.name} lineage row: {lineage_row}"
-                )
-
-                sources, cll = self._get_sources(
-                    lineage_type,
-                    alias_db_name,
-                    source_schema=lineage_row.source_schema,
-                    source_table=lineage_row.source_table,
-                    ddl=lineage_row.ddl,
-                    filename=lineage_row.filename,
-                )
-
-                target.upstreams.update(
-                    self._get_upstream_lineages(
-                        sources=sources,
-                        target_table=target.dataset.urn,
-                        target_dataset_cll=cll,
-                        all_tables_set=all_tables_set,
-                        alias_db_name=alias_db_name,
-                        raw_db_name=raw_db_name,
-                        connection=connection,
-                    )
-                )
-                target.cll = cll
-
-                # Merging upstreams if dataset already exists and has upstreams
-                if target.dataset.urn in self._lineage_map:
-                    self._lineage_map[target.dataset.urn].merge_lineage(
-                        upstreams=target.upstreams, cll=target.cll
-                    )
-                else:
-                    self._lineage_map[target.dataset.urn] = target
-
-                logger.debug(
-                    f"Lineage[{target}]:{self._lineage_map[target.dataset.urn]}"
-                )
-        except Exception as e:
-            self.warn(
-                logger,
-                f"extract-{lineage_type.name}",
-                f"Error was {e}, {traceback.format_exc()}",
-            )
-            self.report_status(f"extract-{lineage_type.name}", False)
-
-    def _update_lineage_map_for_table_renames(
-        self, table_renames: Dict[str, TableRename]
-    ) -> None:
-        if not table_renames:
-            return
-
-        logger.info(f"Updating lineage map for {len(table_renames)} table renames")
-        for entry in table_renames.values():
-            # This table was renamed from some other name, copy in the lineage
-            # for the previous name as well.
-            prev_table_lineage = self._lineage_map.get(entry.original_urn)
-            if prev_table_lineage:
-                logger.debug(
-                    f"including lineage for {entry.original_urn} in {entry.new_urn} due to table rename"
-                )
-                self._lineage_map[entry.new_urn].merge_lineage(
-                    upstreams=prev_table_lineage.upstreams,
-                    cll=prev_table_lineage.cll,
-                )
-
     def _get_target_lineage(
         self,
         alias_db_name: str,
@@ -569,7 +370,7 @@ class RedshiftLineageExtractor:
                 ),
             )
         except ValueError as e:
-            self.
+            self.report.warning("non-s3-lineage", str(e))
             return None
         else:
             target_platform = LineageDatasetPlatform.REDSHIFT
@@ -588,269 +389,6 @@ class RedshiftLineageExtractor:
             cll=None,
         )
 
-    def _get_upstream_lineages(
-        self,
-        sources: List[LineageDataset],
-        target_table: str,
-        all_tables_set: Dict[str, Dict[str, Set[str]]],
-        alias_db_name: str,
-        raw_db_name: str,
-        connection: redshift_connector.Connection,
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> List[LineageDataset]:
-        target_source = []
-        probable_temp_tables: List[str] = []
-
-        for source in sources:
-            if source.platform == LineageDatasetPlatform.REDSHIFT:
-                db, schema, table = split_qualified_table_name(source.urn)
-                if db == raw_db_name:
-                    db = alias_db_name
-                path = f"{db}.{schema}.{table}"
-                source = LineageDataset(
-                    platform=source.platform,
-                    urn=make_dataset_urn_with_platform_instance(
-                        platform=LineageDatasetPlatform.REDSHIFT.value,
-                        platform_instance=self.config.platform_instance,
-                        name=path,
-                        env=self.config.env,
-                    ),
-                )
-
-                # Filtering out tables which does not exist in Redshift
-                # It was deleted in the meantime or query parser did not capture well the table name
-                # Or it might be a temp table
-                if (
-                    db not in all_tables_set
-                    or schema not in all_tables_set[db]
-                    or table not in all_tables_set[db][schema]
-                ):
-                    logger.debug(
-                        f"{source.urn} missing table. Adding it to temp table list for target table {target_table}.",
-                    )
-                    probable_temp_tables.append(f"{schema}.{table}")
-                    self.report.num_lineage_tables_dropped += 1
-                    continue
-
-            target_source.append(source)
-
-        if probable_temp_tables and self.config.resolve_temp_table_in_lineage:
-            self.report.num_lineage_processed_temp_tables += len(probable_temp_tables)
-            # Generate lineage dataset from temporary tables
-            number_of_permanent_dataset_found: int = (
-                self.update_table_and_column_lineage(
-                    db_name=raw_db_name,
-                    connection=connection,
-                    temp_table_names=probable_temp_tables,
-                    target_source_dataset=target_source,
-                    target_dataset_cll=target_dataset_cll,
-                )
-            )
-
-            logger.debug(
-                f"Number of permanent datasets found for {target_table} = {number_of_permanent_dataset_found} in "
-                f"temp tables {probable_temp_tables}"
-            )
-
-        return target_source
-
-    def populate_lineage(
-        self,
-        database: str,
-        connection: redshift_connector.Connection,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-    ) -> None:
-        if self.config.resolve_temp_table_in_lineage:
-            self._init_temp_table_schema(
-                database=database,
-                temp_tables=list(self.get_temp_tables(connection=connection)),
-            )
-
-        populate_calls: List[Tuple[str, LineageCollectorType]] = []
-
-        all_tables_set: Dict[str, Dict[str, Set[str]]] = {
-            db: {schema: {t.name for t in tables} for schema, tables in schemas.items()}
-            for db, schemas in all_tables.items()
-        }
-
-        table_renames: Dict[str, TableRename] = {}
-        if self.config.include_table_rename_lineage:
-            table_renames, all_tables_set = self._process_table_renames(
-                database=database,
-                connection=connection,
-                all_tables=all_tables_set,
-            )
-
-        if self.config.table_lineage_mode in {
-            LineageMode.STL_SCAN_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate table level lineage by getting upstream tables from stl_scan redshift table
-            query = self.queries.stl_scan_based_lineage_query(
-                self.config.database,
-                self.start_time,
-                self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
-        if self.config.table_lineage_mode in {
-            LineageMode.SQL_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate table level lineage by parsing table creating sqls
-            query = self.queries.list_insert_create_queries_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER))
-
-        if self.config.include_views and self.config.include_view_lineage:
-            # Populate table level lineage for views
-            query = self.queries.view_lineage_query()
-            populate_calls.append((query, LineageCollectorType.VIEW))
-
-            # Populate table level lineage for late binding views
-            query = self.queries.list_late_view_ddls_query()
-            populate_calls.append((query, LineageCollectorType.VIEW_DDL_SQL_PARSING))
-
-        if self.config.include_copy_lineage:
-            query = self.queries.list_copy_commands_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.COPY))
-
-        if self.config.include_unload_lineage:
-            query = self.queries.list_unload_commands_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-
-            populate_calls.append((query, LineageCollectorType.UNLOAD))
-
-        for query, lineage_type in populate_calls:
-            self._populate_lineage_map(
-                query=query,
-                database=database,
-                lineage_type=lineage_type,
-                connection=connection,
-                all_tables_set=all_tables_set,
-            )
-
-        # Handling for alter table statements.
-        self._update_lineage_map_for_table_renames(table_renames=table_renames)
-
-        self.report.lineage_mem_size[self.config.database] = humanfriendly.format_size(
-            memory_footprint.total_size(self._lineage_map)
-        )
-
-    def make_fine_grained_lineage_class(
-        self, lineage_item: LineageItem, dataset_urn: str
-    ) -> List[FineGrainedLineage]:
-        fine_grained_lineages: List[FineGrainedLineage] = []
-
-        if (
-            self.config.extract_column_level_lineage is False
-            or lineage_item.cll is None
-        ):
-            logger.debug("CLL extraction is disabled")
-            return fine_grained_lineages
-
-        logger.debug("Extracting column level lineage")
-
-        cll: List[sqlglot_l.ColumnLineageInfo] = lineage_item.cll
-
-        for cll_info in cll:
-            downstream = (
-                [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)]
-                if cll_info.downstream is not None
-                and cll_info.downstream.column is not None
-                else []
-            )
-
-            upstreams = [
-                builder.make_schema_field_urn(column_ref.table, column_ref.column)
-                for column_ref in cll_info.upstreams
-            ]
-
-            fine_grained_lineages.append(
-                FineGrainedLineage(
-                    downstreamType=FineGrainedLineageDownstreamType.FIELD,
-                    downstreams=downstream,
-                    upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
-                    upstreams=upstreams,
-                )
-            )
-
-        logger.debug(f"Created fine_grained_lineage for {dataset_urn}")
-
-        return fine_grained_lineages
-
-    def get_lineage(
-        self,
-        table: Union[RedshiftTable, RedshiftView],
-        dataset_urn: str,
-        schema: RedshiftSchema,
-    ) -> Optional[UpstreamLineageClass]:
-        upstream_lineage: List[UpstreamClass] = []
-
-        cll_lineage: List[FineGrainedLineage] = []
-
-        if dataset_urn in self._lineage_map:
-            item = self._lineage_map[dataset_urn]
-            for upstream in item.upstreams:
-                upstream_table = UpstreamClass(
-                    dataset=upstream.urn,
-                    type=item.dataset_lineage_type,
-                )
-                upstream_lineage.append(upstream_table)
-
-            cll_lineage = self.make_fine_grained_lineage_class(
-                lineage_item=item,
-                dataset_urn=dataset_urn,
-            )
-
-        tablename = table.name
-        if (
-            table.is_external_table()
-            and schema.is_external_schema()
-            and schema.external_platform
-        ):
-            # external_db_params = schema.option
-            upstream_platform = schema.external_platform.lower()
-            catalog_upstream = UpstreamClass(
-                mce_builder.make_dataset_urn_with_platform_instance(
-                    upstream_platform,
-                    f"{schema.external_database}.{tablename}",
-                    platform_instance=(
-                        self.config.platform_instance_map.get(upstream_platform)
-                        if self.config.platform_instance_map
-                        else None
-                    ),
-                    env=self.config.env,
-                ),
-                DatasetLineageTypeClass.COPY,
-            )
-            upstream_lineage.append(catalog_upstream)
-
-        if upstream_lineage:
-            self.report.upstream_lineage[dataset_urn] = [
-                u.dataset for u in upstream_lineage
-            ]
-        else:
-            return None
-
-        return UpstreamLineage(
-            upstreams=upstream_lineage,
-            fineGrainedLineages=cll_lineage or None,
-        )
-
-    def report_status(self, step: str, status: bool) -> None:
-        if self.redundant_run_skip_handler:
-            self.redundant_run_skip_handler.report_current_run_status(step, status)
-
     def _process_table_renames(
         self,
         database: str,
@@ -924,204 +462,365 @@ class RedshiftLineageExtractor:
         ):
             yield row
 
-    def
-        self,
-
-
+    def build(
+        self,
+        connection: redshift_connector.Connection,
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
+        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
+    ) -> None:
+        # Assume things not in `all_tables` as temp tables.
+        self.known_urns = {
+            DatasetUrn.create_from_ids(
+                self.platform,
+                f"{db}.{schema}.{table.name}",
+                env=self.config.env,
+                platform_instance=self.config.platform_instance,
+            ).urn()
+            for db, schemas in all_tables.items()
+            for schema, tables in schemas.items()
+            for table in tables
+        }
+
+        # Handle all the temp tables up front.
+        if self.config.resolve_temp_table_in_lineage:
+            for temp_row in self.get_temp_tables(connection=connection):
+                self.aggregator.add_observed_query(
+                    ObservedQuery(
+                        query=temp_row.query_text,
+                        default_db=self.database,
+                        default_schema=self.config.default_schema,
+                        session_id=temp_row.session_id,
+                        timestamp=temp_row.start_time,
+                    ),
+                    # The "temp table" query actually returns all CREATE TABLE statements, even if they
+                    # aren't explicitly a temp table. As such, setting is_known_temp_table=True
+                    # would not be correct. We already have mechanisms to autodetect temp tables,
+                    # so we won't lose anything by not setting it.
+                    is_known_temp_table=False,
+                )
+
+        populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []
 
-
-
-
-        self.
+        if self.config.include_table_rename_lineage:
+            # Process all the ALTER TABLE RENAME statements
+            table_renames, _ = self._process_table_renames(
+                database=self.database,
+                connection=connection,
+                all_tables=defaultdict(lambda: defaultdict(set)),
             )
+            for entry in table_renames.values():
+                self.aggregator.add_table_rename(entry)
 
-
-
-
-
-
+        if self.config.table_lineage_mode in {
+            LineageMode.SQL_BASED,
+            LineageMode.MIXED,
+        }:
+            # Populate lineage by parsing table creating sqls
+            query = self.queries.list_insert_create_queries_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (
+                    LineageCollectorType.QUERY_SQL_PARSER,
+                    query,
+                    self._process_sql_parser_lineage,
+                )
+            )
+        if self.config.table_lineage_mode in {
+            LineageMode.STL_SCAN_BASED,
+            LineageMode.MIXED,
+        }:
+            # Populate lineage by getting upstream tables from stl_scan redshift table
+            query = self.queries.stl_scan_based_lineage_query(
+                self.database,
+                self.start_time,
+                self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
+            )
 
-
+        if self.config.include_views and self.config.include_view_lineage:
+            # Populate lineage for views
+            query = self.queries.view_lineage_query()
+            populate_calls.append(
+                (LineageCollectorType.VIEW, query, self._process_view_lineage)
+            )
 
-
-
-
-
-
-
-
-
-
+            # Populate lineage for late binding views
+            query = self.queries.list_late_view_ddls_query()
+            populate_calls.append(
+                (
+                    LineageCollectorType.VIEW_DDL_SQL_PARSING,
+                    query,
+                    self._process_view_lineage,
+                )
+            )
+
+        if self.config.include_copy_lineage:
+            # Populate lineage for copy commands.
+            query = self.queries.list_copy_commands_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.COPY, query, self._process_copy_command)
+            )
 
-
-
-
+        if self.config.include_unload_lineage:
+            # Populate lineage for unload commands.
+            query = self.queries.list_unload_commands_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.UNLOAD, query, self._process_unload_command)
+            )
 
-
-
-
+        for lineage_type, query, processor in populate_calls:
+            self._populate_lineage_agg(
+                query=query,
+                lineage_type=lineage_type,
+                processor=processor,
+                connection=connection,
             )
-            self.report.num_unresolved_temp_columns += 1
-            return column_refs
-
-        for ref in column_refs:
-            resolved = False
-            if ref.table in self.temp_tables:
-                table = self.temp_tables[ref.table]
-                if table.parsed_result and table.parsed_result.column_lineage:
-                    for column_lineage in table.parsed_result.column_lineage:
-                        if (
-                            column_lineage.downstream.table == ref.table
-                            and column_lineage.downstream.column == ref.column
-                        ):
-                            resolved_column_refs.extend(
-                                self.resolve_column_refs(
-                                    column_lineage.upstreams, depth=depth + 1
-                                )
-                            )
-                            resolved = True
-                            break
-                # If we reach here, it means that we were not able to resolve the column reference.
-                if resolved is False:
-                    logger.warning(
-                        f"Unable to resolve column reference {ref} to a permanent table"
-                    )
-            else:
-                logger.debug(
-                    f"Resolved column reference {ref} is not resolved because referenced table {ref.table} is not a temp table or not found. Adding reference as non-temp table. This is normal."
-                )
-                resolved_column_refs.append(ref)
-        return resolved_column_refs
 
-
-        self
-
-
-
-    ) -> None:
-        for target_column_lineage in target_dataset_cll:
-            upstreams: List[sqlglot_l.ColumnRef] = []
-            # Look for temp_table_urn in upstream of column_lineage, if found then we need to replace it with
-            # column of permanent table
-            for target_column_ref in target_column_lineage.upstreams:
-                if target_column_ref.table == temp_table_urn:
-                    # Look for column_ref.table and column_ref.column in downstream of source_dataset_cll.
-                    # The source_dataset_cll contains CLL generated from create statement of temp table (temp_table_urn)
-                    for source_column_lineage in source_dataset_cll:
-                        if (
-                            source_column_lineage.downstream.table
-                            == target_column_ref.table
-                            and source_column_lineage.downstream.column
-                            == target_column_ref.column
-                        ):
-                            resolved_columns = self.resolve_column_refs(
-                                source_column_lineage.upstreams
-                            )
-                            # Add all upstream of above temporary column into upstream of target column
-                            upstreams.extend(resolved_columns)
-                    continue
-
-                upstreams.append(target_column_ref)
-
-            if upstreams:
-                # update the upstreams
-                target_column_lineage.upstreams = upstreams
-
-    def _add_permanent_datasets_recursively(
+        # Populate lineage for external tables.
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+
+    def _populate_lineage_agg(
         self,
-
-
-
+        query: str,
+        lineage_type: LineageCollectorType,
+        processor: Callable[[LineageRow], None],
         connection: redshift_connector.Connection,
-        permanent_lineage_datasets: List[LineageDataset],
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
     ) -> None:
-
+        logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
+        try:
+            logger.debug(f"Processing {lineage_type.name} lineage query: {query}")
 
-
-
-
+            timer = self.report.lineage_phases_timer.setdefault(
+                lineage_type.name, PerfTimer()
+            )
+            with timer:
+                for lineage_row in RedshiftDataDictionary.get_lineage_rows(
+                    conn=connection, query=query
+                ):
+                    processor(lineage_row)
+        except Exception as e:
+            self.report.warning(
+                title="Failed to extract some lineage",
+                message=f"Failed to extract lineage of type {lineage_type.name}",
+                context=f"Query: '{query}'",
+                exc=e,
            )
+            self.report_status(f"extract-{lineage_type.name}", False)
+
+    def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
+        ddl = lineage_row.ddl
+        if ddl is None:
+            return
+
+        # TODO actor
 
-
-
-            query=
-
+        self.aggregator.add_observed_query(
+            ObservedQuery(
+                query=ddl,
+                default_db=self.database,
+                default_schema=self.config.default_schema,
+                timestamp=lineage_row.timestamp,
+                session_id=lineage_row.session_id,
            )
+        )
 
-
-
-
-
-
-
-
-
-
-        )
+    def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
+        target = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        if target.urn() not in self.known_urns:
+            logger.debug(
+                f"Skipping lineage for {target.urn()} as it is not in known_urns"
+            )
+            return None
 
-
-        # if such dataset is present then add it to transitive_temp_tables to resolve it to original permanent table
-        for lineage_dataset in intermediate_l_datasets:
-            db, schema, table = split_qualified_table_name(lineage_dataset.urn)
+        return target
 
-
-
-
+    def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
-
-
-
-
+        source = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
 
-
-
+        if lineage_row.ddl is None:
+            logger.warning(
+                f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
+            )
+            return
+        self.aggregator.add_known_query_lineage(
+            KnownQueryLineageInfo(
+                query_text=lineage_row.ddl,
+                downstream=target.urn(),
+                upstreams=[source.urn()],
+                timestamp=lineage_row.timestamp,
+            ),
+            merge_lineage=True,
+        )
+
+    def _process_view_lineage(self, lineage_row: LineageRow) -> None:
+        ddl = lineage_row.ddl
+        if ddl is None:
+            return
 
-
-
-
-            continue
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
+        self.aggregator.add_view_definition(
+            view_urn=target,
+            view_definition=ddl,
+            default_db=self.database,
+            default_schema=self.config.default_schema,
+        )
 
-
-
-
-
-
-
-
-
-
+    def _process_copy_command(self, lineage_row: LineageRow) -> None:
+        logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
+        sources = self._get_sources(
+            lineage_type=LineageCollectorType.COPY,
+            db_name=self.database,
+            source_schema=None,
+            source_table=None,
+            ddl=None,
+            filename=lineage_row.filename,
+        )
+        logger.debug(f"Recognized sources: {sources}")
+        source = sources[0]
+        if not source:
+            logger.debug("Ignoring command since couldn't recognize proper source")
+            return
+        s3_urn = source[0].urn
+        logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
+        if not lineage_row.target_schema or not lineage_row.target_table:
+            logger.debug(
+                f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
            )
+            return
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
-
-        db_name: str,
-        temp_table_names: List[str],
-        connection: redshift_connector.Connection,
-        target_source_dataset: List[LineageDataset],
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> int:
-        permanent_lineage_datasets: List[LineageDataset] = []
-
-        temp_table_rows: List[TempTableRow] = self.find_temp_tables(
-            temp_table_rows=list(self.temp_tables.values()),
-            temp_table_names=temp_table_names,
+        self.aggregator.add_known_lineage_mapping(
+            upstream_urn=s3_urn, downstream_urn=target.urn()
        )
 
-
+    def _process_unload_command(self, lineage_row: LineageRow) -> None:
+        lineage_entry = self._get_target_lineage(
+            alias_db_name=self.database,
+            lineage_row=lineage_row,
+            lineage_type=LineageCollectorType.UNLOAD,
+            all_tables_set={},
+        )
+        if not lineage_entry:
+            return
+        output_urn = lineage_entry.dataset.urn
 
-
-
-
-
-
-
-
+        if not lineage_row.source_schema or not lineage_row.source_table:
+            return
+        source = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
        )
+        if source.urn() not in self.known_urns:
+            logger.debug(
+                f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
+            )
+            return
+
+        self.aggregator.add_known_lineage_mapping(
+            upstream_urn=source.urn(), downstream_urn=output_urn
+        )
+
+    def _process_external_tables(
+        self,
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
+        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
+    ) -> None:
+        for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
+            for table in tables:
+                schema = db_schemas[self.database][schema_name]
+                if (
+                    table.is_external_table()
+                    and schema.is_external_schema()
+                    and schema.external_platform
+                ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
+                    # external_db_params = schema.option
+                    upstream_platform = schema.external_platform.lower()
+
+                    table_urn = mce_builder.make_dataset_urn_with_platform_instance(
+                        self.platform,
+                        f"{self.database}.{schema_name}.{table.name}",
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                    if upstream_platform == self.platform:
+                        upstream_schema = schema.get_upstream_schema_name() or "public"
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{upstream_schema}.{table.name}"
+                        )
+                        upstream_platform_instance = self.config.platform_instance
+                    else:
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{table.name}"
+                        )
+                        upstream_platform_instance = (
+                            self.config.platform_instance_map.get(upstream_platform)
+                            if self.config.platform_instance_map
+                            else None
+                        )
 
-
+                    upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
+                        upstream_platform,
+                        upstream_dataset_name,
+                        platform_instance=upstream_platform_instance,
+                        env=self.config.env,
+                    )
+
+                    self.aggregator.add_known_lineage_mapping(
+                        upstream_urn=upstream_urn,
+                        downstream_urn=table_urn,
+                    )
+
+    def generate(self) -> Iterable[MetadataWorkUnit]:
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()
+        if len(self.aggregator.report.observed_query_parse_failures) > 0:
+            self.report.report_warning(
+                title="Failed to extract some SQL lineage",
+                message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
+                context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
+            )
 
-
+    def close(self) -> None:
+        self.aggregator.close()