acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic by the registry.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/upgrade/upgrade.py
CHANGED

@@ -45,6 +45,7 @@ class ServerVersionStats(BaseModel):
     latest: Optional[VersionStats] = None
     current_server_type: Optional[str] = None
     current_server_default_cli_version: Optional[VersionStats] = None
+    is_cloud_server: Optional[bool] = None


 class ClientVersionStats(BaseModel):
@@ -145,7 +146,9 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:

 async def get_server_version_stats(
     server: Optional[DataHubGraph] = None,
-) -> Tuple[
+) -> Tuple[
+    Optional[str], Optional[Version], Optional[str], Optional[datetime], Optional[bool]
+]:
     import aiohttp

     server_config: Optional[RestServiceConfig] = None
@@ -167,11 +170,13 @@ async def get_server_version_stats(
     server_version: Optional[Version] = None
     current_server_default_cli_version = None
     current_server_release_date = None
+    is_cloud_server: Optional[bool] = None
     if server_config:
         server_version_string = server_config.service_version
         commit_hash = server_config.commit_hash
         server_type = server_config.server_type
         current_server_default_cli_version = server_config.default_cli_version
+        is_cloud_server = server_config.is_datahub_cloud
         if server_type == "quickstart" and commit_hash:
             async with aiohttp.ClientSession(
                 headers={"Accept": "application/vnd.github.v3+json"}
@@ -191,6 +196,7 @@ async def get_server_version_stats(
         server_version,
         current_server_default_cli_version,
         current_server_release_date,
+        is_cloud_server,
     )


@@ -236,6 +242,7 @@ async def _retrieve_version_stats(
         current_server_version,
         current_server_default_cli_version,
         current_server_release_date,
+        is_cloud_server,
     ) = results[2]

     server_version_stats = None
@@ -255,6 +262,7 @@ async def _retrieve_version_stats(
             else None
         ),
         current_server_type=current_server_type,
+        is_cloud_server=is_cloud_server,
     )

     if client_version_stats and server_version_stats:
@@ -353,7 +361,11 @@ def _maybe_print_upgrade_message(
         else None
     )
     client_server_compat = 0
-
+    # Skip version compatibility checks for cloud servers (serverEnv="cloud")
+    # Cloud servers use different versioning schemes between server and CLI
+    is_cloud = version_stats.server.is_cloud_server
+
+    if not is_cloud:
         client_server_compat = is_client_server_compatible(
             version_stats.client.current, version_stats.server.current
         )
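For context on this change: the new is_cloud_server flag is threaded from the server config (server_config.is_datahub_cloud) through ServerVersionStats into _maybe_print_upgrade_message, where it gates the CLI/server compatibility check. A minimal sketch of the gating behavior, using hypothetical stand-ins (ServerStats, should_check_compatibility) rather than the real helpers:

from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerStats:
    # Only the field this diff adds; the real ServerVersionStats has more.
    is_cloud_server: Optional[bool] = None


def should_check_compatibility(server: ServerStats) -> bool:
    # Cloud servers version independently of the CLI, so the
    # client/server compatibility check is skipped for them.
    return not server.is_cloud_server


assert should_check_compatibility(ServerStats(is_cloud_server=False))
assert should_check_compatibility(ServerStats(is_cloud_server=None))  # unknown -> still check
assert not should_check_compatibility(ServerStats(is_cloud_server=True))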
datahub/ingestion/source/redshift/lineage_v2.py
DELETED

@@ -1,466 +0,0 @@
-import collections
-import logging
-from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
-
-import redshift_connector
-
-from datahub.emitter import mce_builder
-from datahub.ingestion.api.closeable import Closeable
-from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
-from datahub.ingestion.source.redshift.lineage import (
-    LineageCollectorType,
-    RedshiftLineageExtractor,
-)
-from datahub.ingestion.source.redshift.query import (
-    RedshiftCommonQuery,
-    RedshiftProvisionedQuery,
-    RedshiftServerlessQuery,
-)
-from datahub.ingestion.source.redshift.redshift_schema import (
-    LineageRow,
-    RedshiftDataDictionary,
-    RedshiftSchema,
-    RedshiftTable,
-    RedshiftView,
-)
-from datahub.ingestion.source.redshift.report import RedshiftReport
-from datahub.ingestion.source.state.redundant_run_skip_handler import (
-    RedundantLineageRunSkipHandler,
-)
-from datahub.metadata.urns import DatasetUrn
-from datahub.sql_parsing.sql_parsing_aggregator import (
-    KnownQueryLineageInfo,
-    ObservedQuery,
-    SqlParsingAggregator,
-)
-from datahub.utilities.perf_timer import PerfTimer
-
-logger = logging.getLogger(__name__)
-
-
-class RedshiftSqlLineageV2(Closeable):
-    # does lineage and usage based on SQL parsing.
-
-    def __init__(
-        self,
-        config: RedshiftConfig,
-        report: RedshiftReport,
-        context: PipelineContext,
-        database: str,
-        redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
-    ):
-        self.platform = "redshift"
-        self.config = config
-        self.report = report
-        self.context = context
-
-        self.database = database
-        self.known_urns: Set[str] = set()  # will be set later
-
-        self.aggregator = SqlParsingAggregator(
-            platform=self.platform,
-            platform_instance=self.config.platform_instance,
-            env=self.config.env,
-            generate_lineage=True,
-            generate_queries=self.config.lineage_v2_generate_queries,
-            generate_usage_statistics=False,
-            generate_operations=False,
-            usage_config=self.config,
-            graph=self.context.graph,
-            is_temp_table=self._is_temp_table,
-        )
-        self.report.sql_aggregator = self.aggregator.report
-
-        self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
-        if self.config.is_serverless:
-            self.queries = RedshiftServerlessQuery()
-
-        self._lineage_v1 = RedshiftLineageExtractor(
-            config=config,
-            report=report,
-            context=context,
-            redundant_run_skip_handler=redundant_run_skip_handler,
-        )
-
-        self.start_time, self.end_time = (
-            self.report.lineage_start_time,
-            self.report.lineage_end_time,
-        ) = self._lineage_v1.get_time_window()
-
-    def _is_temp_table(self, name: str) -> bool:
-        return (
-            DatasetUrn.create_from_ids(
-                self.platform,
-                name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            ).urn()
-            not in self.known_urns
-        )
-
-    def build(
-        self,
-        connection: redshift_connector.Connection,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-    ) -> None:
-        # Assume things not in `all_tables` as temp tables.
-        self.known_urns = {
-            DatasetUrn.create_from_ids(
-                self.platform,
-                f"{db}.{schema}.{table.name}",
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            ).urn()
-            for db, schemas in all_tables.items()
-            for schema, tables in schemas.items()
-            for table in tables
-        }
-
-        # Handle all the temp tables up front.
-        if self.config.resolve_temp_table_in_lineage:
-            for temp_row in self._lineage_v1.get_temp_tables(connection=connection):
-                self.aggregator.add_observed_query(
-                    ObservedQuery(
-                        query=temp_row.query_text,
-                        default_db=self.database,
-                        default_schema=self.config.default_schema,
-                        session_id=temp_row.session_id,
-                        timestamp=temp_row.start_time,
-                    ),
-                    # The "temp table" query actually returns all CREATE TABLE statements, even if they
-                    # aren't explicitly a temp table. As such, setting is_known_temp_table=True
-                    # would not be correct. We already have mechanisms to autodetect temp tables,
-                    # so we won't lose anything by not setting it.
-                    is_known_temp_table=False,
-                )
-
-        populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []
-
-        if self.config.include_table_rename_lineage:
-            # Process all the ALTER TABLE RENAME statements
-            table_renames, _ = self._lineage_v1._process_table_renames(
-                database=self.database,
-                connection=connection,
-                all_tables=collections.defaultdict(
-                    lambda: collections.defaultdict(set)
-                ),
-            )
-            for entry in table_renames.values():
-                self.aggregator.add_table_rename(entry)
-
-        if self.config.table_lineage_mode in {
-            LineageMode.SQL_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate lineage by parsing table creating sqls
-            query = self.queries.list_insert_create_queries_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (
-                    LineageCollectorType.QUERY_SQL_PARSER,
-                    query,
-                    self._process_sql_parser_lineage,
-                )
-            )
-        if self.config.table_lineage_mode in {
-            LineageMode.STL_SCAN_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate lineage by getting upstream tables from stl_scan redshift table
-            query = self.queries.stl_scan_based_lineage_query(
-                self.database,
-                self.start_time,
-                self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
-            )
-
-        if self.config.include_views and self.config.include_view_lineage:
-            # Populate lineage for views
-            query = self.queries.view_lineage_query()
-            populate_calls.append(
-                (LineageCollectorType.VIEW, query, self._process_view_lineage)
-            )
-
-            # Populate lineage for late binding views
-            query = self.queries.list_late_view_ddls_query()
-            populate_calls.append(
-                (
-                    LineageCollectorType.VIEW_DDL_SQL_PARSING,
-                    query,
-                    self._process_view_lineage,
-                )
-            )
-
-        if self.config.include_copy_lineage:
-            # Populate lineage for copy commands.
-            query = self.queries.list_copy_commands_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.COPY, query, self._process_copy_command)
-            )
-
-        if self.config.include_unload_lineage:
-            # Populate lineage for unload commands.
-            query = self.queries.list_unload_commands_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.UNLOAD, query, self._process_unload_command)
-            )
-
-        for lineage_type, query, processor in populate_calls:
-            self._populate_lineage_agg(
-                query=query,
-                lineage_type=lineage_type,
-                processor=processor,
-                connection=connection,
-            )
-
-        # Populate lineage for external tables.
-        if not self.config.skip_external_tables:
-            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
-
-    def _populate_lineage_agg(
-        self,
-        query: str,
-        lineage_type: LineageCollectorType,
-        processor: Callable[[LineageRow], None],
-        connection: redshift_connector.Connection,
-    ) -> None:
-        logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
-        try:
-            logger.debug(f"Processing {lineage_type.name} lineage query: {query}")
-
-            timer = self.report.lineage_phases_timer.setdefault(
-                lineage_type.name, PerfTimer()
-            )
-            with timer:
-                for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                    conn=connection, query=query
-                ):
-                    processor(lineage_row)
-        except Exception as e:
-            self.report.warning(
-                title="Failed to extract some lineage",
-                message=f"Failed to extract lineage of type {lineage_type.name}",
-                context=f"Query: '{query}'",
-                exc=e,
-            )
-            self._lineage_v1.report_status(f"extract-{lineage_type.name}", False)
-
-    def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
-        ddl = lineage_row.ddl
-        if ddl is None:
-            return
-
-        # TODO actor
-
-        self.aggregator.add_observed_query(
-            ObservedQuery(
-                query=ddl,
-                default_db=self.database,
-                default_schema=self.config.default_schema,
-                timestamp=lineage_row.timestamp,
-                session_id=lineage_row.session_id,
-            )
-        )
-
-    def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
-        target = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-        if target.urn() not in self.known_urns:
-            logger.debug(
-                f"Skipping lineage for {target.urn()} as it is not in known_urns"
-            )
-            return None
-
-        return target
-
-    def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        source = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-
-        if lineage_row.ddl is None:
-            logger.warning(
-                f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
-            )
-            return
-        self.aggregator.add_known_query_lineage(
-            KnownQueryLineageInfo(
-                query_text=lineage_row.ddl,
-                downstream=target.urn(),
-                upstreams=[source.urn()],
-                timestamp=lineage_row.timestamp,
-            ),
-            merge_lineage=True,
-        )
-
-    def _process_view_lineage(self, lineage_row: LineageRow) -> None:
-        ddl = lineage_row.ddl
-        if ddl is None:
-            return
-
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        self.aggregator.add_view_definition(
-            view_urn=target,
-            view_definition=ddl,
-            default_db=self.database,
-            default_schema=self.config.default_schema,
-        )
-
-    def _process_copy_command(self, lineage_row: LineageRow) -> None:
-        logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
-        sources = self._lineage_v1._get_sources(
-            lineage_type=LineageCollectorType.COPY,
-            db_name=self.database,
-            source_schema=None,
-            source_table=None,
-            ddl=None,
-            filename=lineage_row.filename,
-        )
-        logger.debug(f"Recognized sources: {sources}")
-        source = sources[0]
-        if not source:
-            logger.debug("Ignoring command since couldn't recognize proper source")
-            return
-        s3_urn = source[0].urn
-        logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
-        if not lineage_row.target_schema or not lineage_row.target_table:
-            logger.debug(
-                f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
-            )
-            return
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        self.aggregator.add_known_lineage_mapping(
-            upstream_urn=s3_urn, downstream_urn=target.urn()
-        )
-
-    def _process_unload_command(self, lineage_row: LineageRow) -> None:
-        lineage_entry = self._lineage_v1._get_target_lineage(
-            alias_db_name=self.database,
-            lineage_row=lineage_row,
-            lineage_type=LineageCollectorType.UNLOAD,
-            all_tables_set={},
-        )
-        if not lineage_entry:
-            return
-        output_urn = lineage_entry.dataset.urn
-
-        if not lineage_row.source_schema or not lineage_row.source_table:
-            return
-        source = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-        if source.urn() not in self.known_urns:
-            logger.debug(
-                f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
-            )
-            return
-
-        self.aggregator.add_known_lineage_mapping(
-            upstream_urn=source.urn(), downstream_urn=output_urn
-        )
-
-    def _process_external_tables(
-        self,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-    ) -> None:
-        for schema_name, tables in all_tables[self.database].items():
-            logger.info(f"External table lineage: checking schema {schema_name}")
-            if not db_schemas[self.database].get(schema_name):
-                logger.warning(f"Schema {schema_name} not found")
-                continue
-            for table in tables:
-                schema = db_schemas[self.database][schema_name]
-                if (
-                    table.is_external_table()
-                    and schema.is_external_schema()
-                    and schema.external_platform
-                ):
-                    logger.info(
-                        f"External table lineage: processing table {schema_name}.{table.name}"
-                    )
-                    # external_db_params = schema.option
-                    upstream_platform = schema.external_platform.lower()
-
-                    table_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                        self.platform,
-                        f"{self.database}.{schema_name}.{table.name}",
-                        platform_instance=self.config.platform_instance,
-                        env=self.config.env,
-                    )
-                    if upstream_platform == self.platform:
-                        upstream_schema = schema.get_upstream_schema_name() or "public"
-                        upstream_dataset_name = (
-                            f"{schema.external_database}.{upstream_schema}.{table.name}"
-                        )
-                        upstream_platform_instance = self.config.platform_instance
-                    else:
-                        upstream_dataset_name = (
-                            f"{schema.external_database}.{table.name}"
-                        )
-                        upstream_platform_instance = (
-                            self.config.platform_instance_map.get(upstream_platform)
-                            if self.config.platform_instance_map
-                            else None
-                        )
-
-                    upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                        upstream_platform,
-                        upstream_dataset_name,
-                        platform_instance=upstream_platform_instance,
-                        env=self.config.env,
-                    )
-
-                    self.aggregator.add_known_lineage_mapping(
-                        upstream_urn=upstream_urn,
-                        downstream_urn=table_urn,
-                    )
-
-    def generate(self) -> Iterable[MetadataWorkUnit]:
-        for mcp in self.aggregator.gen_metadata():
-            yield mcp.as_workunit()
-        if len(self.aggregator.report.observed_query_parse_failures) > 0:
-            self.report.report_warning(
-                title="Failed to extract some SQL lineage",
-                message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
-                context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
-            )
-
-    def close(self) -> None:
-        self.aggregator.close()
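The removed RedshiftSqlLineageV2 class treated any table whose URN was not registered in known_urns (built from all_tables during schema extraction) as a temp table. A standalone sketch of that heuristic, with a simplified make_urn standing in for DatasetUrn.create_from_ids(...).urn():

from typing import Dict, List, Set


def make_urn(platform: str, name: str, env: str = "PROD") -> str:
    # Simplified stand-in; the real URN comes from DatasetUrn.create_from_ids().
    return f"urn:li:dataset:(urn:li:dataPlatform:{platform},{name},{env})"


def build_known_urns(all_tables: Dict[str, Dict[str, List[str]]]) -> Set[str]:
    # Every table discovered during schema extraction is "known".
    return {
        make_urn("redshift", f"{db}.{schema}.{table}")
        for db, schemas in all_tables.items()
        for schema, tables in schemas.items()
        for table in tables
    }


def is_temp_table(name: str, known_urns: Set[str]) -> bool:
    # Anything a query references that was never discovered is assumed temporary.
    return make_urn("redshift", name) not in known_urns


known = build_known_urns({"dev": {"public": ["orders"]}})
assert not is_temp_table("dev.public.orders", known)
assert is_temp_table("dev.public.orders_staging", known)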
{acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL
File without changes
{acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE
File without changes
{acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt
File without changes