acryl-datahub 0.15.0rc14__py3-none-any.whl → 0.15.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc14.dist-info → acryl_datahub-0.15.0rc16.dist-info}/METADATA +2414 -2430
- {acryl_datahub-0.15.0rc14.dist-info → acryl_datahub-0.15.0rc16.dist-info}/RECORD +24 -26
- datahub/__init__.py +1 -1
- datahub/cli/cli_utils.py +2 -0
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source_helpers.py +3 -1
- datahub/ingestion/sink/datahub_rest.py +3 -3
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/gc/datahub_gc.py +5 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/tableau/tableau.py +107 -30
- datahub/ingestion/source/unity/source.py +2 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/specific/datajob.py +4 -10
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.15.0rc14.dist-info → acryl_datahub-0.15.0rc16.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc14.dist-info → acryl_datahub-0.15.0rc16.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc14.dist-info → acryl_datahub-0.15.0rc16.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -485,6 +485,18 @@ class TableauConfig(
         description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
     )
 
+    ingest_hidden_assets: bool = Field(
+        True,
+        description="When enabled, hidden views and dashboards are ingested into Datahub. "
+        "If a dashboard or view is hidden in Tableau the luid is blank. Default of this config field is True.",
+    )
+
+    tags_for_hidden_assets: List[str] = Field(
+        default=[],
+        description="Tags to be added to hidden dashboards and views. If a dashboard or view is hidden in Tableau the luid is blank. "
+        "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
+    )
+
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
     @root_validator(pre=True)
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
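Taken together with the existing ingest_tags flag, these options control whether hidden Tableau views and dashboards are ingested and how they are labelled. A minimal sketch of a Tableau source config fragment follows; the values and connect_uri are illustrative assumptions, not taken from this diff, and connection/auth fields are omitted:

# Hypothetical recipe fragment for the new options, shown as a Python dict.
tableau_source_config = {
    "connect_uri": "https://tableau.example.com",  # assumed server URL
    "ingest_tags": True,  # must be enabled when tags_for_hidden_assets is set
    "ingest_hidden_assets": True,  # keep views/dashboards whose luid is blank
    "tags_for_hidden_assets": ["hidden-in-tableau"],  # extra tags for those assets
}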
@@ -510,6 +522,20 @@ class TableauConfig(
 
         return values
 
+    @root_validator()
+    def validate_config_values(cls, values: Dict) -> Dict:
+        tags_for_hidden_assets = values.get("tags_for_hidden_assets")
+        ingest_tags = values.get("ingest_tags")
+        if (
+            not ingest_tags
+            and tags_for_hidden_assets
+            and len(tags_for_hidden_assets) > 0
+        ):
+            raise ValueError(
+                "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
+            )
+        return values
+
 
 class WorkbookKey(ContainerKey):
     workbook_id: str
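For illustration, the rule this validator enforces can be restated as a standalone check; this is a sketch of the logic only, not the project's API:

# Standalone re-statement of validate_config_values, for illustration only.
def check_hidden_asset_tags(values: dict) -> None:
    if values.get("tags_for_hidden_assets") and not values.get("ingest_tags"):
        raise ValueError(
            "tags_for_hidden_assets is only allowed with ingest_tags enabled."
        )

check_hidden_asset_tags({"ingest_tags": True, "tags_for_hidden_assets": ["hidden"]})   # passes
check_hidden_asset_tags({"ingest_tags": False, "tags_for_hidden_assets": ["hidden"]})  # raises ValueError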
@@ -596,7 +622,16 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # lineage
+    num_tables_with_upstream_lineage: int = 0
+    num_upstream_table_lineage: int = 0
+    num_upstream_fine_grained_lineage: int = 0
     num_upstream_table_skipped_no_name: int = 0
+    num_upstream_table_skipped_no_columns: int = 0
+    num_upstream_table_failed_generate_reference: int = 0
+    num_upstream_table_lineage_failed_parse_sql: int = 0
+    num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
+    num_hidden_assets_skipped: int = 0
 
 
 @platform_name("Tableau")
@@ -1043,6 +1078,11 @@ class TableauSiteSource:
             ),
         )
 
+    def _is_hidden_view(self, dashboard_or_view: Dict) -> bool:
+        # LUID is blank if the view is hidden in the workbook.
+        # More info here: https://help.tableau.com/current/api/metadata_api/en-us/reference/view.doc.html
+        return not dashboard_or_view.get(c.LUID)
+
     def get_connection_object_page(
         self,
         query: str,
@@ -1311,7 +1351,7 @@ class TableauSiteSource:
         datasource: dict,
         browse_path: Optional[str],
         is_embedded_ds: bool = False,
-    ) -> Tuple:
+    ) -> Tuple[List[Upstream], List[FineGrainedLineage]]:
         upstream_tables: List[Upstream] = []
         fine_grained_lineages: List[FineGrainedLineage] = []
         table_id_to_urn = {}
@@ -1472,6 +1512,7 @@ class TableauSiteSource:
                 c.COLUMNS_CONNECTION
             ].get("totalCount")
             if not is_custom_sql and not num_tbl_cols:
+                self.report.num_upstream_table_skipped_no_columns += 1
                 logger.warning(
                     f"Skipping upstream table with id {table[c.ID]}, no columns: {table}"
                 )
@@ -1488,6 +1529,7 @@ class TableauSiteSource:
                     table, default_schema_map=self.config.default_schema_map
                 )
             except Exception as e:
+                self.report.num_upstream_table_failed_generate_reference += 1
                 self.report.warning(
                     title="Potentially Missing Lineage Issue",
                     message="Failed to generate upstream reference",
@@ -1659,15 +1701,7 @@ class TableauSiteSource:
             func_overridden_info=None,  # Here we don't want to override any information from configuration
         )
 
-        if parsed_result is None:
-            logger.info(
-                f"Failed to extract column level lineage from datasource {datasource_urn}"
-            )
-            return []
-        if parsed_result.debug_info.error:
-            logger.info(
-                f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.error}"
-            )
+        if parsed_result is None or parsed_result.debug_info.error:
             return []
 
         cll: List[ColumnLineageInfo] = (
@@ -2031,6 +2065,8 @@ class TableauSiteSource:
             aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
+        self.report.num_tables_with_upstream_lineage += 1
+        self.report.num_upstream_table_lineage += len(upstream_tables)
 
     @staticmethod
     def _clean_tableau_query_parameters(query: str) -> str:
@@ -2130,7 +2166,7 @@ class TableauSiteSource:
             f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}"
         )
 
-
+        parsed_result = create_lineage_sql_parsed_result(
             query=query,
             default_db=upstream_db,
             platform=platform,
@@ -2140,6 +2176,21 @@ class TableauSiteSource:
             schema_aware=not self.config.sql_parsing_disable_schema_awareness,
         )
 
+        assert parsed_result is not None
+
+        if parsed_result.debug_info.table_error:
+            logger.warning(
+                f"Failed to extract table lineage from datasource {datasource_urn}: {parsed_result.debug_info.table_error}"
+            )
+            self.report.num_upstream_table_lineage_failed_parse_sql += 1
+        elif parsed_result.debug_info.column_error:
+            logger.warning(
+                f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.column_error}"
+            )
+            self.report.num_upstream_fine_grained_lineage_failed_parse_sql += 1
+
+        return parsed_result
+
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
@@ -2174,9 +2225,6 @@ class TableauSiteSource:
         )
 
         if parsed_result is None:
-            logger.info(
-                f"Failed to extract table level lineage for datasource {csql_urn}"
-            )
             return
 
         self._enrich_database_tables_with_parsed_schemas(parsed_result)
@@ -2196,12 +2244,14 @@ class TableauSiteSource:
             upstreams=upstream_tables,
             fineGrainedLineages=fine_grained_lineages,
         )
-
         yield self.get_metadata_change_proposal(
             csql_urn,
             aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
+        self.report.num_tables_with_upstream_lineage += 1
+        self.report.num_upstream_table_lineage += len(upstream_tables)
+        self.report.num_upstream_fine_grained_lineage += len(fine_grained_lineages)
 
     def _get_schema_metadata_for_datasource(
         self, datasource_fields: List[dict]
@@ -2278,12 +2328,11 @@ class TableauSiteSource:
         )
 
         # Tags
-        if datasource_info:
+        if datasource_info and self.config.ingest_tags:
             tags = self.get_tags(datasource_info)
-
-
-
-            )
+            dataset_snapshot.aspects.append(
+                builder.make_global_tag_aspect_with_tag_list(tags)
+            )
 
         # Browse path
         if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME):
@@ -2352,6 +2401,11 @@ class TableauSiteSource:
             aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
+        self.report.num_tables_with_upstream_lineage += 1
+        self.report.num_upstream_table_lineage += len(upstream_tables)
+        self.report.num_upstream_fine_grained_lineage += len(
+            fine_grained_lineages
+        )
 
         # Datasource Fields
         schema_metadata = self._get_schema_metadata_for_datasource(
@@ -2669,7 +2723,13 @@ class TableauSiteSource:
             c.SHEETS_CONNECTION,
             sheets_filter,
         ):
-
+            if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
+                yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
+            else:
+                self.report.num_hidden_assets_skipped += 1
+                logger.debug(
+                    f"Skip view {sheet.get(c.ID)} because it's hidden (luid is blank)."
+                )
 
     def emit_sheets_as_charts(
         self, sheet: dict, workbook: Optional[Dict]
@@ -2760,11 +2820,17 @@ class TableauSiteSource:
             chart_snapshot.aspects.append(owner)
 
         # Tags
-
-
+        if self.config.ingest_tags:
+            tags = self.get_tags(sheet)
+            if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
+                sheet
+            ):
+                tags.extend(self.config.tags_for_hidden_assets)
+
             chart_snapshot.aspects.append(
                 builder.make_global_tag_aspect_with_tag_list(tags)
             )
+
         yield self.get_metadata_change_event(chart_snapshot)
         if sheet_external_url is not None and self.config.ingest_embed_url is True:
             yield self.new_work_unit(
@@ -2846,7 +2912,7 @@ class TableauSiteSource:
             else None
         )
 
-        tags = self.get_tags(workbook)
+        tags = self.get_tags(workbook) if self.config.ingest_tags else None
 
         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
@@ -2977,17 +3043,23 @@ class TableauSiteSource:
             c.DASHBOARDS_CONNECTION,
             dashboards_filter,
         ):
-
+            if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
+                yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
+            else:
+                self.report.num_hidden_assets_skipped += 1
+                logger.debug(
+                    f"Skip dashboard {dashboard.get(c.ID)} because it's hidden (luid is blank)."
+                )
 
-    def get_tags(self, obj: dict) ->
+    def get_tags(self, obj: dict) -> List[str]:
         tag_list = obj.get(c.TAGS, [])
-        if tag_list
+        if tag_list:
             tag_list_str = [
                 t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME)
             ]
 
             return tag_list_str
-        return
+        return []
 
     def emit_dashboard(
         self, dashboard: dict, workbook: Optional[Dict]
@@ -3038,8 +3110,13 @@ class TableauSiteSource:
         )
         dashboard_snapshot.aspects.append(dashboard_info_class)
 
-
-
+        if self.config.ingest_tags:
+            tags = self.get_tags(dashboard)
+            if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
+                dashboard
+            ):
+                tags.extend(self.config.tags_for_hidden_assets)
+
             dashboard_snapshot.aspects.append(
                 builder.make_global_tag_aspect_with_tag_list(tags)
             )
datahub/ingestion/source/unity/source.py
CHANGED

@@ -556,6 +556,8 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         )
 
         if table_props:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Consider enabling incremental_properties by default
             patch_builder = create_dataset_props_patch_builder(dataset_urn, table_props)
             for patch_mcp in patch_builder.build():
                 yield MetadataWorkUnit(
datahub/ingestion/source/unity/usage.py
CHANGED

@@ -7,7 +7,6 @@ from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Set,
 
 import pyspark
 from databricks.sdk.service.sql import QueryStatementType
-from sqllineage.runner import LineageRunner
 
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
@@ -22,7 +21,9 @@ from datahub.ingestion.source.unity.proxy_types import (
 from datahub.ingestion.source.unity.report import UnityCatalogReport
 from datahub.ingestion.source.usage.usage_common import UsageAggregator
 from datahub.metadata.schema_classes import OperationClass
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,7 @@ class UnityCatalogUsageExtractor:
     proxy: UnityCatalogApiProxy
     table_urn_builder: Callable[[TableReference], str]
     user_urn_builder: Callable[[str], str]
+    platform: str = "databricks"
 
     def __post_init__(self):
         self.usage_aggregator = UsageAggregator[TableReference](self.config)
@@ -173,7 +175,7 @@ class UnityCatalogUsageExtractor:
         self, query: Query, table_map: TableMap
     ) -> Optional[QueryTableInfo]:
         with self.report.usage_perf_report.sql_parsing_timer:
-            table_info = self.
+            table_info = self._parse_query_via_sqlglot(query.query_text)
         if table_info is None and query.statement_type == QueryStatementType.SELECT:
             with self.report.usage_perf_report.spark_sql_parsing_timer:
                 table_info = self._parse_query_via_spark_sql_plan(query.query_text)
@@ -191,26 +193,33 @@ class UnityCatalogUsageExtractor:
             ),
         )
 
-    def
+    def _parse_query_via_sqlglot(self, query: str) -> Optional[StringTableInfo]:
         try:
-
+            sql_parser_in_tables = create_lineage_sql_parsed_result(
+                query=query,
+                default_db=None,
+                platform=self.platform,
+                env=self.config.env,
+                platform_instance=None,
+            )
+
             return GenericTableInfo(
                 source_tables=[
-                    self.
-                    for table in
+                    self._parse_sqlglot_table(table)
+                    for table in sql_parser_in_tables.in_tables
                 ],
                 target_tables=[
-                    self.
-                    for table in
+                    self._parse_sqlglot_table(table)
+                    for table in sql_parser_in_tables.out_tables
                 ],
             )
         except Exception as e:
-            logger.info(f"Could not parse query via
+            logger.info(f"Could not parse query via sqlglot, {query}: {e!r}")
             return None
 
     @staticmethod
-    def
-        full_table_name =
+    def _parse_sqlglot_table(table_urn: str) -> str:
+        full_table_name = DatasetUrn.from_string(table_urn).name
         default_schema = "<default>."
         if full_table_name.startswith(default_schema):
             return full_table_name[len(default_schema) :]
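For context, a minimal sketch of the sqlglot-based parsing path the extractor now relies on. The query and printed output are illustrative assumptions; only the create_lineage_sql_parsed_result call shape and the "<default>." prefix stripping mirror the code above:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
from datahub.utilities.urns.dataset_urn import DatasetUrn

# Parse one query the way _parse_query_via_sqlglot does; all values are illustrative.
result = create_lineage_sql_parsed_result(
    query="INSERT INTO sales.daily_totals SELECT region, SUM(amount) FROM sales.orders GROUP BY region",
    default_db=None,
    platform="databricks",
    env="PROD",
    platform_instance=None,
)

# in_tables/out_tables hold dataset URNs; mirroring _parse_sqlglot_table above,
# the "<default>." schema prefix is stripped from each URN's table name.
for urn in result.in_tables + result.out_tables:
    full_name = DatasetUrn.from_string(urn).name
    if full_name.startswith("<default>."):
        full_name = full_name[len("<default>.") :]
    print(full_name)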
datahub/specific/datajob.py
CHANGED
@@ -102,7 +102,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(input, Edge):
             input_urn: str = input.destinationUrn
@@ -114,8 +114,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         input_edge = Edge(
             destinationUrn=input_urn,
-            created=self._mint_auditstamp(),
-            lastModified=self._mint_auditstamp(),
         )
 
         self._ensure_urn_type("dataJob", [input_edge], "add_input_datajob")
@@ -185,7 +183,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(input, Edge):
             input_urn: str = input.destinationUrn
@@ -197,8 +195,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         input_edge = Edge(
             destinationUrn=input_urn,
-            created=self._mint_auditstamp(),
-            lastModified=self._mint_auditstamp(),
         )
 
         self._ensure_urn_type("dataset", [input_edge], "add_input_dataset")
@@ -270,7 +266,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         Notes:
             If `output` is an Edge object, it is used directly. If `output` is a Urn object or string,
-            it is converted to an Edge object and added
+            it is converted to an Edge object and added without any audit stamps.
         """
         if isinstance(output, Edge):
             output_urn: str = output.destinationUrn
@@ -282,15 +278,13 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
         output_edge = Edge(
             destinationUrn=output_urn,
-            created=self._mint_auditstamp(),
-            lastModified=self._mint_auditstamp(),
         )
 
         self._ensure_urn_type("dataset", [output_edge], "add_output_dataset")
         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
             "add",
-            path=f"/outputDatasetEdges/{self.quote(
+            path=f"/outputDatasetEdges/{self.quote(output_urn)}",
             value=output_edge,
         )
         return self
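The practical effect of dropping created/lastModified is that patch edges built from plain URNs no longer carry synthetic audit stamps. A small usage sketch of DataJobPatchBuilder under that assumption; the URNs are invented for illustration:

from datahub.specific.datajob import DataJobPatchBuilder

# Build a patch that adds one input and one output dataset edge to a data job.
# After this change the generated Edge objects carry no created/lastModified stamps.
patch = DataJobPatchBuilder(
    "urn:li:dataJob:(urn:li:dataFlow:(airflow,example_dag,PROD),example_task)"
)
patch.add_input_dataset(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.upstream,PROD)"
)
patch.add_output_dataset(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.downstream,PROD)"
)
for mcp in patch.build():
    print(mcp.aspectName)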
datahub/utilities/partition_executor.py
CHANGED

@@ -268,7 +268,7 @@ class BatchPartitionExecutor(Closeable):
         self.process_batch = process_batch
         self.min_process_interval = min_process_interval
         self.read_from_pending_interval = read_from_pending_interval
-        assert self.max_workers
+        assert self.max_workers >= 1
 
         self._state_lock = threading.Lock()
         self._executor = ThreadPoolExecutor(
datahub/utilities/sql_lineage_parser_impl.py
DELETED

@@ -1,160 +0,0 @@
-import contextlib
-import logging
-import re
-import unittest
-import unittest.mock
-from typing import Dict, List, Optional, Set
-
-from sqllineage.core.holders import Column, SQLLineageHolder
-from sqllineage.exceptions import SQLLineageException
-
-from datahub.utilities.sql_parser_base import SQLParser, SqlParserException
-
-with contextlib.suppress(ImportError):
-    import sqlparse
-    from networkx import DiGraph
-    from sqllineage.core import LineageAnalyzer
-
-    import datahub.utilities.sqllineage_patch
-logger = logging.getLogger(__name__)
-
-
-class SqlLineageSQLParserImpl(SQLParser):
-    _DATE_SWAP_TOKEN = "__d_a_t_e"
-    _HOUR_SWAP_TOKEN = "__h_o_u_r"
-    _TIMESTAMP_SWAP_TOKEN = "__t_i_m_e_s_t_a_m_p"
-    _DATA_SWAP_TOKEN = "__d_a_t_a"
-    _ADMIN_SWAP_TOKEN = "__a_d_m_i_n"
-    _MYVIEW_SQL_TABLE_NAME_TOKEN = "__my_view__.__sql_table_name__"
-    _MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME"
-
-    def __init__(self, sql_query: str, use_raw_names: bool = False) -> None:
-        super().__init__(sql_query)
-        original_sql_query = sql_query
-        self._use_raw_names = use_raw_names
-
-        # SqlLineageParser makes mistakes on lateral flatten queries, use the prefix
-        if "lateral flatten" in sql_query:
-            sql_query = sql_query[: sql_query.find("lateral flatten")]
-
-        # Replace reserved words that break SqlLineageParser
-        self.token_to_original: Dict[str, str] = {
-            self._DATE_SWAP_TOKEN: "date",
-            self._HOUR_SWAP_TOKEN: "hour",
-            self._TIMESTAMP_SWAP_TOKEN: "timestamp",
-            self._DATA_SWAP_TOKEN: "data",
-            self._ADMIN_SWAP_TOKEN: "admin",
-        }
-        for replacement, original in self.token_to_original.items():
-            # Replace original tokens with replacement. Since table and column name can contain a hyphen('-'),
-            # also prevent original tokens appearing as part of these names with a hyphen from getting substituted.
-            sql_query = re.sub(
-                rf"((?<!-)\b{original}\b)(?!-)",
-                rf"{replacement}",
-                sql_query,
-                flags=re.IGNORECASE,
-            )
-
-        # SqlLineageParser lowercarese tablenames and we need to replace Looker specific token which should be uppercased
-        sql_query = re.sub(
-            rf"(\${{{self._MYVIEW_LOOKER_TOKEN}}})",
-            rf"{self._MYVIEW_SQL_TABLE_NAME_TOKEN}",
-            sql_query,
-        )
-
-        # SqlLineageParser does not handle "encode" directives well. Remove them
-        sql_query = re.sub(r"\sencode [a-zA-Z]*", "", sql_query, flags=re.IGNORECASE)
-
-        # Replace lookml templates with the variable otherwise sqlparse can't parse ${
-        sql_query = re.sub(r"(\${)(.+)(})", r"\2", sql_query)
-        if sql_query != original_sql_query:
-            logger.debug(f"Rewrote original query {original_sql_query} as {sql_query}")
-
-        self._sql = sql_query
-        self._stmt_holders: Optional[List[LineageAnalyzer]] = None
-        self._sql_holder: Optional[SQLLineageHolder] = None
-        try:
-            self._stmt = [
-                s
-                for s in sqlparse.parse(
-                    # first apply sqlparser formatting just to get rid of comments, which cause
-                    # inconsistencies in parsing output
-                    sqlparse.format(
-                        self._sql.strip(),
-                        strip_comments=True,
-                        use_space_around_operators=True,
-                    ),
-                )
-                if s.token_first(skip_cm=True)
-            ]
-
-            with unittest.mock.patch(
-                "sqllineage.core.handlers.source.SourceHandler.end_of_query_cleanup",
-                datahub.utilities.sqllineage_patch.end_of_query_cleanup_patch,
-            ):
-                with unittest.mock.patch(
-                    "sqllineage.core.holders.SubQueryLineageHolder.add_column_lineage",
-                    datahub.utilities.sqllineage_patch.add_column_lineage_patch,
-                ):
-                    self._stmt_holders = [
-                        LineageAnalyzer().analyze(stmt) for stmt in self._stmt
-                    ]
-                    self._sql_holder = SQLLineageHolder.of(*self._stmt_holders)
-        except SQLLineageException as e:
-            raise SqlParserException(
-                f"SQL lineage analyzer error '{e}' for query: '{self._sql}"
-            ) from e
-
-    def get_tables(self) -> List[str]:
-        result: List[str] = []
-        if self._sql_holder is None:
-            logger.error("sql holder not present so cannot get tables")
-            return result
-        for table in self._sql_holder.source_tables:
-            table_normalized = re.sub(
-                r"^<default>.",
-                "",
-                (
-                    str(table)
-                    if not self._use_raw_names
-                    else f"{table.schema.raw_name}.{table.raw_name}"
-                ),
-            )
-            result.append(str(table_normalized))
-
-        # We need to revert TOKEN replacements
-        for token, replacement in self.token_to_original.items():
-            result = [replacement if c == token else c for c in result]
-        result = [
-            self._MYVIEW_LOOKER_TOKEN if c == self._MYVIEW_SQL_TABLE_NAME_TOKEN else c
-            for c in result
-        ]
-
-        # Sort tables to make the list deterministic
-        result.sort()
-
-        return result
-
-    def get_columns(self) -> List[str]:
-        if self._sql_holder is None:
-            raise SqlParserException("sql holder not present so cannot get columns")
-        graph: DiGraph = self._sql_holder.graph  # For mypy attribute checking
-        column_nodes = [n for n in graph.nodes if isinstance(n, Column)]
-        column_graph = graph.subgraph(column_nodes)
-
-        target_columns = {column for column, deg in column_graph.out_degree if deg == 0}
-
-        result: Set[str] = set()
-        for column in target_columns:
-            # Let's drop all the count(*) and similard columns which are expression actually if it does not have an alias
-            if not any(ele in column.raw_name for ele in ["*", "(", ")"]):
-                result.add(str(column.raw_name))
-
-        # Reverting back all the previously renamed words which confuses the parser
-        result = {"date" if c == self._DATE_SWAP_TOKEN else c for c in result}
-        result = {
-            "timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result)
-        }
-
-        # swap back renamed date column
-        return list(result)