acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3431 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +47 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/source.py +19 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +19 -9
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
datahub/metadata/_internal_schema_classes.py
CHANGED

@@ -14127,26 +14127,39 @@ class CorpUserHomePageSettingsClass(DictWrapper):
 
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.identity.CorpUserHomePageSettings")
     def __init__(self,
-        pageTemplate: str,
+        pageTemplate: Union[None, str]=None,
+        dismissedAnnouncements: Union[None, List[str]]=None,
     ):
         super().__init__()
 
         self.pageTemplate = pageTemplate
+        self.dismissedAnnouncements = dismissedAnnouncements
 
     def _restore_defaults(self) -> None:
-        self.pageTemplate =
+        self.pageTemplate = self.RECORD_SCHEMA.fields_dict["pageTemplate"].default
+        self.dismissedAnnouncements = self.RECORD_SCHEMA.fields_dict["dismissedAnnouncements"].default
 
 
     @property
-    def pageTemplate(self) -> str:
+    def pageTemplate(self) -> Union[None, str]:
         """The page template that will be rendered in the UI by default for this user"""
         return self._inner_dict.get('pageTemplate')  # type: ignore
 
     @pageTemplate.setter
-    def pageTemplate(self, value: str) -> None:
+    def pageTemplate(self, value: Union[None, str]) -> None:
         self._inner_dict['pageTemplate'] = value
 
 
+    @property
+    def dismissedAnnouncements(self) -> Union[None, List[str]]:
+        """The list of announcement urns that have been dismissed by the user"""
+        return self._inner_dict.get('dismissedAnnouncements')  # type: ignore
+
+    @dismissedAnnouncements.setter
+    def dismissedAnnouncements(self, value: Union[None, List[str]]) -> None:
+        self._inner_dict['dismissedAnnouncements'] = value
+
+
 class CorpUserInfoClass(_Aspect):
     """Linkedin corp user information"""
 

@@ -24394,6 +24407,59 @@ class DataHubSecretValueClass(_Aspect):
         self._inner_dict['created'] = value
 
 
+class ApplicationsSettingsClass(DictWrapper):
+    # No docs available.
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.settings.global.ApplicationsSettings")
+    def __init__(self,
+        enabled: bool,
+        config: Union[None, str]=None,
+        configVersion: Union[None, str]=None,
+    ):
+        super().__init__()
+
+        self.enabled = enabled
+        self.config = config
+        self.configVersion = configVersion
+
+    def _restore_defaults(self) -> None:
+        self.enabled = bool()
+        self.config = self.RECORD_SCHEMA.fields_dict["config"].default
+        self.configVersion = self.RECORD_SCHEMA.fields_dict["configVersion"].default
+
+
+    @property
+    def enabled(self) -> bool:
+        # No docs available.
+        return self._inner_dict.get('enabled')  # type: ignore
+
+    @enabled.setter
+    def enabled(self, value: bool) -> None:
+        self._inner_dict['enabled'] = value
+
+
+    @property
+    def config(self) -> Union[None, str]:
+        """The configuration for the feature, in JSON format."""
+        return self._inner_dict.get('config')  # type: ignore
+
+    @config.setter
+    def config(self, value: Union[None, str]) -> None:
+        self._inner_dict['config'] = value
+
+
+    @property
+    def configVersion(self) -> Union[None, str]:
+        """The version of the configuration schema that has been used to serialize
+    the config.
+If not provided, the version is assumed to be the latest version."""
+        return self._inner_dict.get('configVersion')  # type: ignore
+
+    @configVersion.setter
+    def configVersion(self, value: Union[None, str]) -> None:
+        self._inner_dict['configVersion'] = value
+
+
 class DocPropagationFeatureSettingsClass(DictWrapper):
     # No docs available.
 

@@ -24502,6 +24568,7 @@ class GlobalSettingsInfoClass(_Aspect):
         views: Union[None, "GlobalViewsSettingsClass"]=None,
         docPropagation: Optional[Union["DocPropagationFeatureSettingsClass", None]]=None,
         homePage: Union[None, "GlobalHomePageSettingsClass"]=None,
+        applications: Union[None, "ApplicationsSettingsClass"]=None,
     ):
         super().__init__()
 

@@ -24513,12 +24580,14 @@ class GlobalSettingsInfoClass(_Aspect):
         else:
             self.docPropagation = docPropagation
         self.homePage = homePage
+        self.applications = applications
 
     def _restore_defaults(self) -> None:
         self.sso = self.RECORD_SCHEMA.fields_dict["sso"].default
         self.views = self.RECORD_SCHEMA.fields_dict["views"].default
         self.docPropagation = _json_converter.from_json_object(self.RECORD_SCHEMA.fields_dict["docPropagation"].default, writers_schema=self.RECORD_SCHEMA.fields_dict["docPropagation"].type)
         self.homePage = self.RECORD_SCHEMA.fields_dict["homePage"].default
+        self.applications = self.RECORD_SCHEMA.fields_dict["applications"].default
 
 
     @property

@@ -24561,6 +24630,16 @@ class GlobalSettingsInfoClass(_Aspect):
         self._inner_dict['homePage'] = value
 
 
+    @property
+    def applications(self) -> Union[None, "ApplicationsSettingsClass"]:
+        """Settings related to applications. If not enabled, applications won't show up in navigation"""
+        return self._inner_dict.get('applications')  # type: ignore
+
+    @applications.setter
+    def applications(self, value: Union[None, "ApplicationsSettingsClass"]) -> None:
+        self._inner_dict['applications'] = value
+
+
 class GlobalViewsSettingsClass(DictWrapper):
     """Settings for DataHub Views feature."""
 

@@ -27086,6 +27165,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.schemafield.SchemaFieldAliases': SchemaFieldAliasesClass,
     'com.linkedin.pegasus2avro.schemafield.SchemaFieldInfo': SchemaFieldInfoClass,
     'com.linkedin.pegasus2avro.secret.DataHubSecretValue': DataHubSecretValueClass,
+    'com.linkedin.pegasus2avro.settings.global.ApplicationsSettings': ApplicationsSettingsClass,
     'com.linkedin.pegasus2avro.settings.global.DocPropagationFeatureSettings': DocPropagationFeatureSettingsClass,
     'com.linkedin.pegasus2avro.settings.global.GlobalHomePageSettings': GlobalHomePageSettingsClass,
     'com.linkedin.pegasus2avro.settings.global.GlobalSettingsInfo': GlobalSettingsInfoClass,

@@ -27592,6 +27672,7 @@ __SCHEMA_TYPES = {
     'SchemaFieldAliases': SchemaFieldAliasesClass,
     'SchemaFieldInfo': SchemaFieldInfoClass,
     'DataHubSecretValue': DataHubSecretValueClass,
+    'ApplicationsSettings': ApplicationsSettingsClass,
     'DocPropagationFeatureSettings': DocPropagationFeatureSettingsClass,
     'GlobalHomePageSettings': GlobalHomePageSettingsClass,
     'GlobalSettingsInfo': GlobalSettingsInfoClass,
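The hunks above regenerate the settings model classes. A minimal usage sketch follows; the constructors themselves come from this diff, while the import path and the assumption that GlobalSettingsInfoClass's remaining fields keep their None defaults are not shown here:

from datahub.metadata.schema_classes import (
    ApplicationsSettingsClass,
    CorpUserHomePageSettingsClass,
    GlobalSettingsInfoClass,
)

# CorpUserHomePageSettingsClass is now constructible without a pageTemplate and
# can carry the new dismissedAnnouncements list (the urn below is illustrative).
home_page = CorpUserHomePageSettingsClass(
    dismissedAnnouncements=["urn:li:post:example-announcement"],
)

# The new ApplicationsSettingsClass plugs into GlobalSettingsInfoClass via the
# new optional `applications` field.
settings = GlobalSettingsInfoClass(
    applications=ApplicationsSettingsClass(enabled=True, config="{}"),
)
assert settings.applications is not None and settings.applications.enabled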
datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py
CHANGED

@@ -7,6 +7,7 @@
 # pylint: skip-file
 # fmt: off
 # isort: skip_file
+from ......schema_classes import ApplicationsSettingsClass
 from ......schema_classes import DocPropagationFeatureSettingsClass
 from ......schema_classes import GlobalHomePageSettingsClass
 from ......schema_classes import GlobalSettingsInfoClass

@@ -15,6 +16,7 @@ from ......schema_classes import OidcSettingsClass
 from ......schema_classes import SsoSettingsClass
 
 
+ApplicationsSettings = ApplicationsSettingsClass
 DocPropagationFeatureSettings = DocPropagationFeatureSettingsClass
 GlobalHomePageSettings = GlobalHomePageSettingsClass
 GlobalSettingsInfo = GlobalSettingsInfoClass
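Because the package directory touched here is literally named "global", a Python keyword, a plain import statement cannot reach it. A small sketch (assumed usage) via importlib:

import importlib

settings_global = importlib.import_module(
    "datahub.metadata.com.linkedin.pegasus2avro.settings.global"
)
# The alias added in this hunk points at the generated class.
assert settings_global.ApplicationsSettings is settings_global.ApplicationsSettingsClass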
datahub/metadata/schema.avsc
CHANGED

@@ -751,9 +751,25 @@
     "class": "com.linkedin.pegasus2avro.common.urn.Urn"
   },
   "Urn": "Urn",
-  "type":
+  "type": [
+    "null",
+    "string"
+  ],
   "name": "pageTemplate",
+  "default": null,
   "doc": "The page template that will be rendered in the UI by default for this user"
+},
+{
+  "type": [
+    "null",
+    {
+      "type": "array",
+      "items": "string"
+    }
+  ],
+  "name": "dismissedAnnouncements",
+  "default": null,
+  "doc": "The list of announcement urns that have been dismissed by the user"
 }
 ],
 "doc": "Settings related to the home page for a user"

@@ -18531,6 +18547,43 @@
 "name": "homePage",
 "default": null,
 "doc": "Global settings related to the home page for an instance"
+},
+{
+  "type": [
+    "null",
+    {
+      "type": "record",
+      "name": "ApplicationsSettings",
+      "namespace": "com.linkedin.pegasus2avro.settings.global",
+      "fields": [
+        {
+          "type": "boolean",
+          "name": "enabled"
+        },
+        {
+          "type": [
+            "null",
+            "string"
+          ],
+          "name": "config",
+          "default": null,
+          "doc": "The configuration for the feature, in JSON format."
+        },
+        {
+          "type": [
+            "null",
+            "string"
+          ],
+          "name": "configVersion",
+          "default": null,
+          "doc": "The version of the configuration schema that has been used to serialize\n the config.\nIf not provided, the version is assumed to be the latest version."
+        }
+      ]
+    }
+  ],
+  "name": "applications",
+  "default": null,
+  "doc": "Settings related to applications. If not enabled, applications won't show up in navigation"
 }
 ],
 "doc": "DataHub Global platform settings. Careful - these should not be modified by the outside world!"
datahub/metadata/schemas/CorpUserSettings.avsc
CHANGED

@@ -172,10 +172,26 @@
   "java": {
     "class": "com.linkedin.pegasus2avro.common.urn.Urn"
   },
-  "type":
+  "type": [
+    "null",
+    "string"
+  ],
   "name": "pageTemplate",
+  "default": null,
   "doc": "The page template that will be rendered in the UI by default for this user",
   "Urn": "Urn"
+},
+{
+  "type": [
+    "null",
+    {
+      "type": "array",
+      "items": "string"
+    }
+  ],
+  "name": "dismissedAnnouncements",
+  "default": null,
+  "doc": "The list of announcement urns that have been dismissed by the user"
 }
 ],
 "doc": "Settings related to the home page for a user"
datahub/metadata/schemas/GlobalSettingsInfo.avsc
CHANGED

@@ -307,6 +307,43 @@
 "name": "homePage",
 "default": null,
 "doc": "Global settings related to the home page for an instance"
+},
+{
+  "type": [
+    "null",
+    {
+      "type": "record",
+      "name": "ApplicationsSettings",
+      "namespace": "com.linkedin.pegasus2avro.settings.global",
+      "fields": [
+        {
+          "type": "boolean",
+          "name": "enabled"
+        },
+        {
+          "type": [
+            "null",
+            "string"
+          ],
+          "name": "config",
+          "default": null,
+          "doc": "The configuration for the feature, in JSON format."
+        },
+        {
+          "type": [
+            "null",
+            "string"
+          ],
+          "name": "configVersion",
+          "default": null,
+          "doc": "The version of the configuration schema that has been used to serialize\n the config.\nIf not provided, the version is assumed to be the latest version."
+        }
+      ]
+    }
+  ],
+  "name": "applications",
+  "default": null,
+  "doc": "Settings related to applications. If not enabled, applications won't show up in navigation"
 }
 ],
 "doc": "DataHub Global platform settings. Careful - these should not be modified by the outside world!"
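A quick inspection sketch for the regenerated aspect schema; it assumes the per-aspect .avsc file is a plain JSON record shipped as package data in the wheel:

import json
from importlib.resources import files

schema = json.loads(
    (files("datahub.metadata") / "schemas" / "GlobalSettingsInfo.avsc").read_text()
)
applications = next(f for f in schema["fields"] if f["name"] == "applications")
# The field is a nullable union with a null default, so settings payloads
# written by older servers remain readable.
assert applications["default"] is None
assert applications["type"][0] == "null"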
datahub/sdk/lineage_client.py
CHANGED

@@ -478,6 +478,7 @@ class LineageClient:
         env: str = "PROD",
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> None:
         """Add lineage by parsing a SQL query."""
         from datahub.sql_parsing.sqlglot_lineage import (

@@ -493,6 +494,7 @@ class LineageClient:
             platform_instance=platform_instance,
             env=env,
             graph=self._client._graph,
+            override_dialect=override_dialect,
         )
 
         if parsed_result.debug_info.table_error:
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -1494,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
             return
 
         # If a query doesn't involve any allowed tables, skip it.
-        if
-        self.is_allowed_table(
-        ):
+        if (
+            downstream_urn is None or not self.is_allowed_table(downstream_urn)
+        ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
             self.report.num_queries_skipped_due_to_filters += 1
             return
 

@@ -1577,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
 
         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams:
-
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float
 
             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams
-                self.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)
 

@@ -1612,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                 upstream_query = self._query_map.get(upstream_query_id)
                 if (
                     upstream_query
-                    and upstream_query.query_id not in
+                    and upstream_query.query_id not in recursion_path
                 ):
                     temp_query_lineage_info = _recurse_into_query(
                         upstream_query, recursion_path

@@ -1672,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )
 
-
-            upstreams=
-            column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret
 
         resolved_lineage_info = _recurse_into_query(base_query, [])
 

@@ -1716,8 +1725,8 @@ class SqlParsingAggregator(Closeable):
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )
 
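The aggregator hunks above do two things: query composition is now memoized per query id, and upstreams/column lineage are collected in OrderedSets so repeated entries collapse. An illustrative sketch of the memoization pattern (not the library's code):

from typing import Dict, List, Set

def resolve_upstreams(query_id: str, upstream_map: Dict[str, List[str]]) -> Set[str]:
    """Flatten a query's transitive upstreams; ids absent from the map are treated as real tables."""
    cache: Dict[str, Set[str]] = {}

    def _recurse(qid: str, path: List[str]) -> Set[str]:
        if qid in path:      # cycle guard, analogous to the recursion_path check
            return set()
        if qid in cache:     # memoized result, analogous to the new cache dict
            return cache[qid]
        result: Set[str] = set()
        for upstream in upstream_map.get(qid, []):
            if upstream in upstream_map:   # another (temp-table) query: recurse
                result |= _recurse(upstream, [*path, qid])
            else:                          # a real table: keep it
                result.add(upstream)
        cache[qid] = result
        return result

    return _recurse(query_id, [])

# resolve_upstreams("q3", {"q3": ["q1", "q2"], "q1": ["db.t1"], "q2": ["db.t1", "db.t2"]})
# -> {"db.t1", "db.t2"}; each shared sub-query is resolved once, however many paths reach it.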
datahub/sql_parsing/sqlglot_lineage.py
CHANGED

@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,

@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 
 
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None

@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
             return v
         return SchemaFieldDataTypeClass.from_obj(v)
 
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
 
-
+
+class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str
 

@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
 
 
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
 
     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
 
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+
 
 class _JoinInfo(_ParserBaseModel):
     join_type: str
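With the __hash__ implementations above, parsed lineage entries can be stored in sets, which is what the aggregator's OrderedSet fields rely on. A small sketch (assumed usage; the urn value is illustrative):

from datahub.sql_parsing.sqlglot_lineage import DownstreamColumnRef

urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.orders,PROD)"
ref_a = DownstreamColumnRef(table=urn, column="id")
ref_b = DownstreamColumnRef(table=urn, column="id")

# Equal field values now hash identically, so duplicates collapse in a set.
assert len({ref_a, ref_b}) == 1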
@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if
-    dialect = get_dialect(
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-    dialect = get_dialect(
+        dialect = get_dialect(schema_resolver.platform)
 
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)

@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.
 

@@ -1441,8 +1466,8 @@ def _sqlglot_lineage_nocache(
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
 
-    The SQL dialect
-    be
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.

@@ -1457,7 +1482,7 @@ def _sqlglot_lineage_nocache(
         schema_resolver: The schema resolver to use for resolving table schemas.
         default_db: The default database to use for unqualified table names.
         default_schema: The default schema to use for unqualified table names.
-
+        override_dialect: Override the dialect provided by 'schema_resolver'.
 
     Returns:
         A SqlParsingResult object containing the parsed lineage information.

@@ -1482,7 +1507,7 @@ def _sqlglot_lineage_nocache(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)

@@ -1520,15 +1545,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
         return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema,
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )
     else:
         return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema,
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
 
 

@@ -1580,6 +1605,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
         platform=platform,

@@ -1599,6 +1625,7 @@ def create_lineage_sql_parsed_result(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
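A hedged end-to-end sketch of the new override_dialect hook. The module path and the override_dialect, default_db, default_schema, graph, schema_aware, platform_instance, and env keyword names appear in this diff; the remaining parameter names (notably the SQL-text argument, written as query below) and the printed result fields are assumptions:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="INSERT INTO sales.fct_orders SELECT id, amount FROM sales.stg_orders",
    platform="snowflake",
    platform_instance=None,
    env="PROD",
    default_db="ANALYTICS",
    default_schema=None,
    override_dialect="snowflake",  # new: force the sqlglot dialect instead of inferring it from the platform
)
print(result.in_tables, result.out_tables)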