acryl-datahub 1.1.0.5rc7__py3-none-any.whl → 1.1.0.5rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/METADATA +2620 -2622
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/RECORD +59 -59
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/autogenerated/capability_summary.json +88 -23
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +43 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +9 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +1 -4
- datahub/ingestion/source/gcs/gcs_source.py +9 -1
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +100 -58
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +1 -1
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +8 -0
- datahub/ingestion/source/sql/teradata.py +993 -234
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/source.py +9 -1
- datahub/sdk/lineage_client.py +2 -2
- datahub/sql_parsing/sql_parsing_aggregator.py +21 -12
- datahub/sql_parsing/sqlglot_lineage.py +40 -15
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/vertica.py
CHANGED

@@ -45,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,

@@ -501,10 +500,7 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()
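The two hunks above drop arguments that MetadataChangeProposalWrapper can derive on its own: the wrapper infers entityType and aspectName from the urn and the aspect class, and defaults changeType to UPSERT. A minimal sketch of the simplified call pattern; the dataset urn is illustrative:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType, changeType (UPSERT), and aspectName are inferred from the
# urn and the aspect class, so only these two arguments are needed.
workunit = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:vertica,db.schema.projection,PROD)",
    aspect=SubTypesClass(typeNames=["Projections"]),
).as_workunit()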
datahub/ingestion/source/sql_queries.py
CHANGED

@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )

@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
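The new override_dialect option forces a specific sqlglot dialect instead of deriving one from the platform. A hedged sketch of a sql-queries recipe that uses it; the platform and query_file values are placeholders, and the remaining field names follow the existing source config:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "sql-queries",
            "config": {
                "platform": "presto",
                "query_file": "./queries.json",
                # parse every query with Trino rules, skipping auto-detection
                "override_dialect": "trino",
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()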
datahub/ingestion/source/superset.py
CHANGED

@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])
 
             for column in dataset_column_info:
                 col_name = column.get("column_name", "")

@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue
 
                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(

@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields
 
+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []
 
         for column in columns:

@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )
 
+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
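Superset metrics now flow into both the dataset schema and the column-level lineage. A standalone sketch of the metric-to-field mapping applied above; the sample metric payload is illustrative, and the resolve_sql_type import path is an assumption based on where the datahub codebase keeps its SQL type resolver:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type  # assumed path
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    NullType,
    SchemaField,
    SchemaFieldDataType,
)

metric = {"metric_name": "total_sales", "metric_type": "sum", "description": "Sum of sales"}

# Mirror gen_metric_schema_fields: resolve the SQL type, falling back to NullType.
data_type = resolve_sql_type(metric["metric_type"]) or NullType()
field = SchemaField(
    fieldPath=metric["metric_name"],
    type=SchemaFieldDataType(data_type),
    nativeDataType=metric["metric_type"],
    description=metric["description"],
    nullable=True,
)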
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -149,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,

@@ -529,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )
 
     env: str = Field(
         default=builder.DEFAULT_ENV,

@@ -2180,32 +2187,32 @@ class TableauSiteSource:
             else []
         )
 
-
-
-
-
-
-
+            tableau_table_list = csql.get(c.TABLES, [])
+            if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+                not tableau_table_list
+                and self.config.extract_lineage_from_unsupported_custom_sql_queries
+            ):
+                if not tableau_table_list:
+                    # custom sql tables may contain unsupported sql, causing incomplete lineage
+                    # we extract the lineage from the raw queries
+                    logger.debug(
+                        "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                    )
+                else:
+                    # The Tableau SQL parser is much worse than our sqlglot based parser,
+                    # so relying on metadata parsed by Tableau from SQL queries can be
+                    # less accurate. This option allows us to ignore Tableau's parser and
+                    # only use our own.
+                    logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
                 yield from self._create_lineage_from_unsupported_csql(
                     csql_urn, csql, columns
                 )
             else:
-                tables = csql.get(c.TABLES, [])
-
-                if tables:
-                    yield from self._create_lineage_to_upstream_tables(
-                        csql_urn, tables, datasource
-                    )
-                elif (
-                    self.config.extract_lineage_from_unsupported_custom_sql_queries
-                ):
-                    logger.debug("Extracting TLL & CLL from custom sql")
-                    # custom sql tables may contain unsupported sql, causing incomplete lineage
-                    # we extract the lineage from the raw queries
-                    yield from self._create_lineage_from_unsupported_csql(
-                        csql_urn, csql, columns
-                    )
+                # lineage from custom sql -> datasets/tables #
+                yield from self._create_lineage_to_upstream_tables(
+                    csql_urn, tableau_table_list, datasource
+                )
 
             # Schema Metadata
             schema_metadata = self.get_schema_metadata_for_custom_sql(columns)

@@ -2243,7 +2250,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
         )

@@ -2408,7 +2414,6 @@ class TableauSiteSource:
         upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2594,7 +2599,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2640,14 +2644,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

@@ -2755,7 +2755,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             datasource_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2774,7 +2773,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]

@@ -2860,7 +2858,11 @@ class TableauSiteSource:
         return datasource
 
     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )
 
         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,

@@ -3553,7 +3555,11 @@ class TableauSiteSource:
         return browse_paths
 
     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )
 
         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,
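The two new Tableau flags decouple data source ingestion from workbooks: by default only data sources referenced by an ingested workbook are emitted, while setting them to True emits everything. A hedged recipe sketch; connect_uri is the standard Tableau connection field, and authentication settings are omitted as placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "tableau",
            "config": {
                "connect_uri": "https://tableau.example.com",
                # ... authentication fields elided ...
                # ingest data sources even when no ingested workbook references them
                "emit_all_published_datasources": True,
                "emit_all_embedded_datasources": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()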
datahub/ingestion/source/tableau/tableau_constant.py
CHANGED

@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"

@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"
datahub/ingestion/source/unity/source.py
CHANGED

@@ -56,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,

@@ -152,7 +153,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @capability(SourceCapability.USAGE_STATS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
 @capability(
     SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
datahub/sdk/lineage_client.py
CHANGED

@@ -478,7 +478,7 @@ class LineageClient:
         env: str = "PROD",
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-
+        override_dialect: Optional[str] = None,
     ) -> None:
         """Add lineage by parsing a SQL query."""
         from datahub.sql_parsing.sqlglot_lineage import (

@@ -494,7 +494,7 @@ class LineageClient:
             platform_instance=platform_instance,
             env=env,
             graph=self._client._graph,
-
+            override_dialect=override_dialect,
         )
 
         if parsed_result.debug_info.table_error:
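A hedged sketch of this pass-through from the SDK side; DataHubClient and its lineage accessor are part of the SDK, but the method name used below (infer_lineage_from_sql) is an assumption, since the hunk shows only the parameter list and body:

from datahub.sdk import DataHubClient

client = DataHubClient(server="http://localhost:8080", token="<token>")
client.lineage.infer_lineage_from_sql(  # hypothetical name; the hunk elides it
    query_text="INSERT INTO sales.summary SELECT * FROM sales.raw",
    platform="snowflake",
    env="PROD",
    override_dialect="snowflake",  # new: bypass dialect auto-detection
)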
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -1577,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
 
         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams: List[UrnStr]  # this is direct upstreams, with *no temp tables*
-            column_lineage: List[ColumnLineageInfo]
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float
 
             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams.extend(other_query.upstreams)
-                self.column_lineage.extend(other_query.column_lineage)
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)

@@ -1612,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
             upstream_query = self._query_map.get(upstream_query_id)
             if (
                 upstream_query
-                and upstream_query.query_id not in composed_of_queries
+                and upstream_query.query_id not in recursion_path
             ):
                 temp_query_lineage_info = _recurse_into_query(
                     upstream_query, recursion_path

@@ -1672,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )
 
-            return QueryLineageInfo(
-                upstreams=new_upstreams,
-                column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret
 
         resolved_lineage_info = _recurse_into_query(base_query, [])

@@ -1716,8 +1725,8 @@ class SqlParsingAggregator(Closeable):
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )
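The aggregator change swaps lists for ordered sets and memoizes _recurse_into_query, so shared upstream queries are resolved once and cycles still short-circuit via the recursion path. A self-contained sketch of that traversal pattern over a toy query graph:

from typing import Dict, List

query_graph: Dict[str, List[str]] = {"q1": ["q2", "q3"], "q2": ["q3"], "q3": []}
cache: Dict[str, List[str]] = {}

def resolve(query_id: str, recursion_path: List[str]) -> List[str]:
    if query_id in recursion_path:
        return []  # cycle: stop recursing, mirroring the return-as-is branch above
    if query_id in cache:
        return cache[query_id]  # memoized: each query is resolved only once
    recursion_path = [*recursion_path, query_id]
    upstreams: Dict[str, None] = {}  # insertion-ordered dedup, like OrderedSet
    for upstream in query_graph[query_id]:
        upstreams.setdefault(upstream)
        for transitive in resolve(upstream, recursion_path):
            upstreams.setdefault(transitive)
    result = list(upstreams)
    cache[query_id] = result
    return result

print(resolve("q1", []))  # ['q2', 'q3'] - q3 reached via q1 and q2, emitted once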
datahub/sql_parsing/sqlglot_lineage.py
CHANGED

@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,

@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 
 
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None

@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
             return v
         return SchemaFieldDataTypeClass.from_obj(v)
 
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
+
 
 class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str

@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
 
 
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
 
     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
 
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+
 
 class _JoinInfo(_ParserBaseModel):
     join_type: str

@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if
-        dialect = get_dialect(
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(
+        dialect = get_dialect(schema_resolver.platform)
 
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)

@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.

@@ -1441,8 +1466,8 @@ def _sqlglot_lineage_nocache(
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
 
-    The SQL dialect
-    be
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.

@@ -1457,7 +1482,7 @@ def _sqlglot_lineage_nocache(
         schema_resolver: The schema resolver to use for resolving table schemas.
         default_db: The default database to use for unqualified table names.
         default_schema: The default schema to use for unqualified table names.
-
+        override_dialect: Override the dialect provided by 'schema_resolver'.
 
     Returns:
         A SqlParsingResult object containing the parsed lineage information.

@@ -1482,7 +1507,7 @@ def _sqlglot_lineage_nocache(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)

@@ -1520,15 +1545,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
         return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema,
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
     else:
         return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema,
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )

@@ -1580,7 +1605,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
         platform=platform,

@@ -1600,7 +1625,7 @@ def create_lineage_sql_parsed_result(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
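All public parsing entry points now accept override_dialect. A minimal sketch against create_lineage_sql_parsed_result, whose signature appears in the hunks above; the query and platform values are illustrative, and schema_aware=False keeps the example free of a live DataHub graph:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="INSERT INTO db.schema.sales_summary SELECT region, sum(amount) FROM db.schema.sales GROUP BY region",
    default_db="db",
    platform="presto",
    platform_instance=None,
    env="PROD",
    schema_aware=False,
    override_dialect="trino",  # parse with Trino rules instead of the platform default
)
print(result.in_tables, result.out_tables)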