acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka/kafka_connect.py
CHANGED
@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
     query: str
     transforms: list

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
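Note: throughout this file the release drops the per-connector report_warning helpers and calls the shared source report directly with a stable message, a per-entity context string, and optionally the exception. A minimal stand-in sketch of that call shape (SketchReport is hypothetical; the real SourceReport in datahub.ingestion.api.source does more, e.g. structured log levels and deduplication):

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class SketchReport:
    # Hypothetical stand-in for datahub's SourceReport, used only to
    # illustrate the (message, context, exc) call shape seen in this diff.
    entries: List[str] = field(default_factory=list)

    def warning(
        self,
        message: str,
        context: Optional[str] = None,
        exc: Optional[Exception] = None,
    ) -> None:
        entry = message if context is None else f"{message} [{context}]"
        if exc is not None:
            entry += f" (cause: {exc!r})"
        self.entries.append(entry)

report = SketchReport()
report.warning("Could not find schema for table", "my-connector : orders")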
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                     source_table = f"{table_name_tuple[-2]}.{source_table}"
                 else:
                     include_source_dataset = False
-                    self.report_warning(
-                        self.connector_manifest.name,
-                        f"could not find schema for table {source_table}",
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
                     )
                 dataset_name: str = get_dataset_name(database_name, source_table)
                 lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                 target_platform=KAFKA,
             )
             lineages.append(lineage)
-            self.report_warning(
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                         include_source_dataset=False,
                     )
                 )
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find input dataset for connector topics {topic_names}",
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
                 )
                 self.connector_manifest.lineages = lineages
                 return
             else:
                 include_source_dataset = True
                 if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.report_warning(
-                        self.connector_manifest.name,
-                        f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}",
+                    self.report.warning(
+                        "Could not find input dataset, connector has unknown transform",
+                        f"{self.connector_manifest.name} : {transforms[0]['type']}",
                     )
                     include_source_dataset = False
                 if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.report_warning(
+                    self.report.warning(
+                        "Could not find input dataset, connector has one or more unknown transforms",
                         self.connector_manifest.name,
-                        "could not find input dataset, connector has one or more unknown transforms",
                     )
                     include_source_dataset = False
                 lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
             lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
     defaultDataset: Optional[str] = None
     version: str = "v1"

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
             transformed_topic = self.apply_transformations(topic, transforms)
             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
             if dataset_table is None:
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find target dataset for topic {transformed_topic}, please check your connector configuration",
+                self.report.warning(
+                    "Could not find target dataset for topic, please check your connector configuration"
+                    f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
             target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
     schema_name: str
     topics_to_tables: Dict[str, str]

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
             )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             self.session.auth = (self.config.username, self.config.password)

-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         payload = connector_response.json()

-        for c in payload:
-            connector_url = f"{self.config.connect_uri}/connectors/{c}"
-            connector_response = self.session.get(connector_url)
-            manifest = connector_response.json()
-            connector_manifest = ConnectorManifest(**manifest)
-            if not self.config.connector_patterns.allowed(connector_manifest.name):
-                self.report.report_dropped(connector_manifest.name)
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue

             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
             connector_manifest.url = connector_url

-            topics = self.session.get(
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)

             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.session.get(
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)

                 # JDBC source connector lineages
                 if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                     )
                     continue

-                for topic in topics:
+                for topic in connector_manifest.topic_names:
                     lineage = KafkaConnectLineage(
                         source_dataset=target_connector.source_dataset,
                         source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         return connectors_manifest

+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
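Note: the three helpers added above wrap standard Kafka Connect REST endpoints. A hedged sketch of the raw calls, with a placeholder URI and connector name and no auth, retry, or error handling; the /topics payload is keyed by connector name, which is why the helper indexes response.json()[connector_name]["topics"]:

import requests

connect_uri = "http://localhost:8083"  # placeholder Connect REST endpoint
name = "my-jdbc-connector"             # placeholder connector name

manifest = requests.get(f"{connect_uri}/connectors/{name}").json()
tasks = requests.get(f"{connect_uri}/connectors/{name}/tasks").json()
# /connectors/<name>/topics returns {"<name>": {"topics": [...]}}
topics = requests.get(f"{connect_uri}/connectors/{name}/topics").json()[name]["topics"]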
datahub/ingestion/source/looker/looker_liquid_tag.py
CHANGED
@@ -4,6 +4,7 @@ from typing import ClassVar, Optional, TextIO
 from liquid import Environment
 from liquid.ast import Node
 from liquid.context import Context
+from liquid.filter import string_filter
 from liquid.parse import expect, get_parser
 from liquid.stream import TokenStream
 from liquid.tag import Tag
@@ -81,12 +82,18 @@ class ConditionTag(Tag):
 custom_tags = [ConditionTag]


+@string_filter
+def sql_quote_filter(variable: str) -> str:
+    return f"'{variable}'"
+
+
 @lru_cache(maxsize=1)
 def _create_env() -> Environment:
-    env: Environment = Environment()
+    env: Environment = Environment(strict_filters=False)
     # register tag. One time activity
     for custom_tag in custom_tags:
         env.add_tag(custom_tag)
+    env.add_filter("sql_quote", sql_quote_filter)
     return env

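Note: the new sql_quote filter wraps a rendered value in single quotes for SQL-flavored liquid, and strict_filters=False makes templates that reference unregistered filters render as no-ops instead of raising. A minimal usage sketch against python-liquid, mirroring the registration above:

from liquid import Environment

env = Environment(strict_filters=False)
# Same behavior as the sql_quote_filter registered in the diff above.
env.add_filter("sql_quote", lambda value: f"'{value}'")

template = env.from_string("WHERE region = {{ region | sql_quote }}")
print(template.render(region="emea"))  # WHERE region = 'emea'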
datahub/ingestion/source/looker/lookml_concept_context.py
CHANGED
@@ -88,8 +88,7 @@ class LookerFieldContext:
         for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql):
             matched_field = upstream_field_match.group(1)
             # Remove quotes from field names
-            matched_field = matched_field.replace('"', "").replace("`", "").lower()
-            column_names.append(matched_field)
+            column_names.append(matched_field.replace('"', "").replace("`", "").lower())

         return column_names

datahub/ingestion/source/looker/view_upstream.py
CHANGED
@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )

@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """

     view_context: LookerViewContext
@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case

+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+

 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
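Note: the method added above reconciles user-declared LookML column names with the canonical casing recorded in GMS before emitting column-level lineage. A hypothetical simplification of the matching step (the real match_columns_to_schema lives in datahub.sql_parsing.schema_resolver and operates on resolved schema metadata, not a plain list):

from typing import Dict, List

def match_columns_sketch(schema_columns: List[str], looker_columns: List[str]) -> List[str]:
    # Map lowercase name -> canonical casing as stored in the schema.
    canonical: Dict[str, str] = {c.lower(): c for c in schema_columns}
    # Fall back to lowercase when a declared column is absent from the schema.
    return [canonical.get(c.lower(), c.lower()) for c in looker_columns]

print(match_columns_sketch(["ID", "Customer_Name"], ["id", "CUSTOMER_NAME", "missing"]))
# -> ['ID', 'Customer_Name', 'missing']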
@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )

         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(
@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs

         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []

         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )

-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )

     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )

     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]
@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref

-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )

     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
datahub/ingestion/source/metadata/business_glossary.py
CHANGED
@@ -45,6 +45,9 @@ class Owners(ConfigModel):
     groups: Optional[List[str]] = None


+OwnersMultipleTypes = Union[List[Owners], Owners]
+
+
 class KnowledgeCard(ConfigModel):
     url: Optional[str] = None
     label: Optional[str] = None
@@ -57,7 +60,7 @@ class GlossaryTermConfig(ConfigModel):
     term_source: Optional[str] = None
     source_ref: Optional[str] = None
     source_url: Optional[str] = None
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     inherits: Optional[List[str]] = None
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
@@ -74,7 +77,7 @@ class GlossaryNodeConfig(ConfigModel):
     id: Optional[str] = None
     name: str
     description: str
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
@@ -88,7 +91,7 @@ class DefaultConfig(ConfigModel):
     """Holds defaults for populating fields in glossary terms"""

     source: Optional[str] = None
-    owners: Owners
+    owners: OwnersMultipleTypes
     url: Optional[str] = None
     source_type: str = "INTERNAL"

@@ -153,30 +156,44 @@ def make_glossary_term_urn(
     return "urn:li:glossaryTerm:" + create_id(path, default_id, enable_auto_id)


-def get_owners(owners: Owners) -> models.OwnershipClass:
-    ownership_type, ownership_type_urn = validate_ownership_type(owners.type)
+def get_owners_multiple_types(owners: OwnersMultipleTypes) -> models.OwnershipClass:
+    """Allows owner types to be a list and maintains backward compatibility"""
+    if isinstance(owners, Owners):
+        return models.OwnershipClass(owners=list(get_owners(owners)))
+
+    owners_meta: List[models.OwnerClass] = []
+    for owner in owners:
+        owners_meta.extend(get_owners(owner))
+
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_owners(owners: Owners) -> Iterable[models.OwnerClass]:
+    actual_type = owners.type or models.OwnershipTypeClass.DEVELOPER
+
+    if actual_type.startswith("urn:li:ownershipType:"):
+        ownership_type: str = "CUSTOM"
+        ownership_type_urn: Optional[str] = actual_type
+    else:
+        ownership_type, ownership_type_urn = validate_ownership_type(actual_type)
+
     if owners.typeUrn is not None:
         ownership_type_urn = owners.typeUrn
-    owners_meta: List[models.OwnerClass] = []
+
     if owners.users is not None:
-        owners_meta += [
-            models.OwnerClass(
+        for o in owners.users:
+            yield models.OwnerClass(
                 owner=make_user_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.users
-        ]
     if owners.groups is not None:
-        owners_meta += [
-            models.OwnerClass(
+        for o in owners.groups:
+            yield models.OwnerClass(
                 owner=make_group_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.groups
-        ]
-    return models.OwnershipClass(owners=owners_meta)


 def get_mces(
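Note: with the OwnersMultipleTypes alias, a glossary's owners may be either a single Owners block (backward compatible) or a list of blocks, each carrying its own ownership type. A hedged usage sketch against the definitions in this diff; the type strings here are assumed to be valid OwnershipTypeClass names:

from datahub.ingestion.source.metadata.business_glossary import (
    Owners,
    get_owners_multiple_types,
)

single = Owners(users=["alice", "bob"])  # old single-block style still works
mixed = [
    Owners(users=["alice"], type="TECHNICAL_OWNER"),
    Owners(groups=["governance-team"], type="BUSINESS_OWNER"),
]

print(get_owners_multiple_types(single))  # OwnershipClass; type defaults to DEVELOPER
print(get_owners_multiple_types(mixed))   # owners from every block, each with its own type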
@@ -185,7 +202,7 @@ def get_mces(
     ingestion_config: BusinessGlossarySourceConfig,
     ctx: PipelineContext,
 ) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
-    root_owners = get_owners(glossary.owners)
+    root_owners = get_owners_multiple_types(glossary.owners)

     if glossary.nodes:
         for node in glossary.nodes:
@@ -270,7 +287,7 @@ def get_mces_from_node(
     node_owners = parentOwners
     if glossaryNode.owners is not None:
         assert glossaryNode.owners is not None
-        node_owners = get_owners(glossaryNode.owners)
+        node_owners = get_owners_multiple_types(glossaryNode.owners)

     node_snapshot = models.GlossaryNodeSnapshotClass(
         urn=node_urn,
@@ -426,7 +443,7 @@ def get_mces_from_term(
     ownership: models.OwnershipClass = parentOwnership
     if glossaryTerm.owners is not None:
         assert glossaryTerm.owners is not None
-        ownership = get_owners(glossaryTerm.owners)
+        ownership = get_owners_multiple_types(glossaryTerm.owners)
         aspects.append(ownership)

     if glossaryTerm.domain is not None:
datahub/ingestion/source/mode.py
CHANGED
@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return None

-    @lru_cache(maxsize=None)
-    def _get_source_from_query(self, raw_query: str) -> set:
-        query = self._replace_definitions(raw_query)
-        parser = LineageRunner(query)
-        source_paths = set()
-        try:
-            for table in parser.source_tables:
-                sources = str(table).split(".")
-                source_schema, source_table = sources[-2], sources[-1]
-                if source_schema == "<default>":
-                    source_schema = str(self.config.default_schema)
-
-                source_paths.add(f"{source_schema}.{source_table}")
-        except Exception as e:
-            self.report.report_failure(
-                title="Failed to Extract Lineage From Query",
-                message="Unable to retrieve lineage from Mode query.",
-                context=f"Query: {raw_query}, Error: {str(e)}",
-            )
-
-        return source_paths
-
     def _get_datasource_urn(
         self,
         platform: str,