acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,18 @@ import functools
|
|
|
5
5
|
import logging
|
|
6
6
|
import traceback
|
|
7
7
|
from collections import defaultdict
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import (
|
|
9
|
+
AbstractSet,
|
|
10
|
+
Any,
|
|
11
|
+
Dict,
|
|
12
|
+
Iterable,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Set,
|
|
16
|
+
Tuple,
|
|
17
|
+
TypeVar,
|
|
18
|
+
Union,
|
|
19
|
+
)
|
|
9
20
|
|
|
10
21
|
import pydantic.dataclasses
|
|
11
22
|
import sqlglot
|
|
@@ -45,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
|
|
|
45
56
|
QueryTypeProps,
|
|
46
57
|
)
|
|
47
58
|
from datahub.sql_parsing.sqlglot_utils import (
|
|
59
|
+
DialectOrStr,
|
|
48
60
|
get_dialect,
|
|
49
61
|
get_query_fingerprint_debug,
|
|
50
62
|
is_dialect_instance,
|
|
@@ -54,6 +66,7 @@ from datahub.utilities.cooperative_timeout import (
|
|
|
54
66
|
CooperativeTimeoutError,
|
|
55
67
|
cooperative_timeout,
|
|
56
68
|
)
|
|
69
|
+
from datahub.utilities.ordered_set import OrderedSet
|
|
57
70
|
|
|
58
71
|
assert SQLGLOT_PATCHED
|
|
59
72
|
|
|
@@ -112,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
|
|
|
112
125
|
|
|
113
126
|
|
|
114
127
|
class DownstreamColumnRef(_ParserBaseModel):
|
|
128
|
+
"""
|
|
129
|
+
TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
|
|
130
|
+
What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
|
|
131
|
+
auto-generated class from .pdl model files. We need a generic solution allowing us to either:
|
|
132
|
+
1. Implement hashing for .pdl model objects
|
|
133
|
+
2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
|
|
134
|
+
hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
|
|
135
|
+
to understand that instruction as well.
|
|
136
|
+
Instances of this class need to be hashable as we store them in a set when processing lineage from queries.
|
|
137
|
+
"""
|
|
138
|
+
|
|
115
139
|
table: Optional[Urn] = None
|
|
116
140
|
column: str
|
|
117
141
|
column_type: Optional[SchemaFieldDataTypeClass] = None
|
|
@@ -127,20 +151,53 @@ class DownstreamColumnRef(_ParserBaseModel):
|
|
|
127
151
|
return v
|
|
128
152
|
return SchemaFieldDataTypeClass.from_obj(v)
|
|
129
153
|
|
|
154
|
+
def __hash__(self) -> int:
|
|
155
|
+
return hash((self.table, self.column, self.native_column_type))
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class ColumnTransformation(_FrozenModel):
|
|
159
|
+
is_direct_copy: bool
|
|
160
|
+
column_logic: str
|
|
161
|
+
|
|
130
162
|
|
|
131
163
|
class _ColumnLineageInfo(_ParserBaseModel):
|
|
132
164
|
downstream: _DownstreamColumnRef
|
|
133
165
|
upstreams: List[_ColumnRef]
|
|
134
166
|
|
|
135
|
-
logic: Optional[
|
|
167
|
+
logic: Optional[ColumnTransformation] = None
|
|
136
168
|
|
|
137
169
|
|
|
138
170
|
class ColumnLineageInfo(_ParserBaseModel):
|
|
171
|
+
"""
|
|
172
|
+
TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
|
|
173
|
+
To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
|
|
174
|
+
depending on it.
|
|
175
|
+
Instances of this class need to be hashable as we store them in a set when processing lineage from queries.
|
|
176
|
+
"""
|
|
177
|
+
|
|
139
178
|
downstream: DownstreamColumnRef
|
|
140
179
|
upstreams: List[ColumnRef]
|
|
141
180
|
|
|
142
|
-
|
|
143
|
-
|
|
181
|
+
logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
|
|
182
|
+
|
|
183
|
+
def __hash__(self) -> int:
|
|
184
|
+
return hash((self.downstream, tuple(self.upstreams), self.logic))
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class _JoinInfo(_ParserBaseModel):
    """Internal description of one join, expressed with parser-internal table/column references."""

    # Stringified join type, as produced by `_get_join_type` (or "LATERAL JOIN").
    join_type: str
    # Tables on the left-hand side of the join.
    left_tables: List[_TableName]
    # Tables on the right-hand side of the join.
    right_tables: List[_TableName]
    # SQL text of the ON condition, or None when the join has none.
    on_clause: Optional[str]
    # Columns referenced by the join condition (empty when there is no ON clause).
    columns_involved: List[_ColumnRef]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class JoinInfo(_ParserBaseModel):
    """Description of one join, with tables identified by URN instead of internal table names."""

    # Stringified join type, copied from the internal `_JoinInfo`.
    join_type: str
    # URNs of the tables on the left-hand side of the join.
    left_tables: List[Urn]
    # URNs of the tables on the right-hand side of the join.
    right_tables: List[Urn]
    # SQL text of the ON condition, or None when the join has none.
    on_clause: Optional[str]
    # Columns referenced by the join condition.
    columns_involved: List[ColumnRef]
|
|
144
201
|
|
|
145
202
|
|
|
146
203
|
class SqlParsingDebugInfo(_ParserBaseModel):
|
|
@@ -178,6 +235,7 @@ class SqlParsingResult(_ParserBaseModel):
|
|
|
178
235
|
out_tables: List[Urn]
|
|
179
236
|
|
|
180
237
|
column_lineage: Optional[List[ColumnLineageInfo]] = None
|
|
238
|
+
joins: Optional[List[JoinInfo]] = None
|
|
181
239
|
|
|
182
240
|
# TODO include formatted original sql logic
|
|
183
241
|
# TODO include list of referenced columns
|
|
@@ -197,13 +255,19 @@ class SqlParsingResult(_ParserBaseModel):
|
|
|
197
255
|
)
|
|
198
256
|
|
|
199
257
|
|
|
258
|
+
def _extract_table_names(
    iterable: Iterable[sqlglot.exp.Table],
) -> OrderedSet[_TableName]:
    """Convert sqlglot table expressions into a de-duplicated, order-preserving set of `_TableName`s."""
    return OrderedSet(map(_TableName.from_sqlglot_table, iterable))
|
|
262
|
+
|
|
263
|
+
|
|
200
264
|
def _table_level_lineage(
|
|
201
265
|
statement: sqlglot.Expression, dialect: sqlglot.Dialect
|
|
202
|
-
) -> Tuple[
|
|
266
|
+
) -> Tuple[AbstractSet[_TableName], AbstractSet[_TableName]]:
|
|
203
267
|
# Generate table-level lineage.
|
|
204
268
|
modified = (
|
|
205
|
-
|
|
206
|
-
|
|
269
|
+
_extract_table_names(
|
|
270
|
+
expr.this
|
|
207
271
|
for expr in statement.find_all(
|
|
208
272
|
sqlglot.exp.Create,
|
|
209
273
|
sqlglot.exp.Insert,
|
|
@@ -215,36 +279,36 @@ def _table_level_lineage(
|
|
|
215
279
|
# In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)",
|
|
216
280
|
# the `this` on the INSERT part isn't a table.
|
|
217
281
|
if isinstance(expr.this, sqlglot.exp.Table)
|
|
218
|
-
|
|
219
|
-
|
|
|
282
|
+
)
|
|
283
|
+
| _extract_table_names(
|
|
220
284
|
# For statements that include a column list, like
|
|
221
285
|
# CREATE DDL statements and `INSERT INTO table (col1, col2) SELECT ...`
|
|
222
286
|
# the table name is nested inside a Schema object.
|
|
223
|
-
|
|
287
|
+
expr.this.this
|
|
224
288
|
for expr in statement.find_all(
|
|
225
289
|
sqlglot.exp.Create,
|
|
226
290
|
sqlglot.exp.Insert,
|
|
227
291
|
)
|
|
228
292
|
if isinstance(expr.this, sqlglot.exp.Schema)
|
|
229
293
|
and isinstance(expr.this.this, sqlglot.exp.Table)
|
|
230
|
-
|
|
231
|
-
|
|
|
294
|
+
)
|
|
295
|
+
| _extract_table_names(
|
|
232
296
|
# For drop statements, we only want it if a table/view is being dropped.
|
|
233
297
|
# Other "kinds" will not have table.name populated.
|
|
234
|
-
|
|
298
|
+
expr.this
|
|
235
299
|
for expr in ([statement] if isinstance(statement, sqlglot.exp.Drop) else [])
|
|
236
300
|
if isinstance(expr.this, sqlglot.exp.Table)
|
|
237
301
|
and expr.this.this
|
|
238
302
|
and expr.this.name
|
|
239
|
-
|
|
303
|
+
)
|
|
240
304
|
)
|
|
241
305
|
|
|
242
306
|
tables = (
|
|
243
|
-
|
|
244
|
-
|
|
307
|
+
_extract_table_names(
|
|
308
|
+
table
|
|
245
309
|
for table in statement.find_all(sqlglot.exp.Table)
|
|
246
310
|
if not isinstance(table.parent, sqlglot.exp.Drop)
|
|
247
|
-
|
|
311
|
+
)
|
|
248
312
|
# ignore references created in this query
|
|
249
313
|
- modified
|
|
250
314
|
# ignore CTEs created in this statement
|
|
@@ -479,16 +543,19 @@ def _select_statement_cll(
|
|
|
479
543
|
root_scope: sqlglot.optimizer.Scope,
|
|
480
544
|
column_resolver: _ColumnResolver,
|
|
481
545
|
output_table: Optional[_TableName],
|
|
546
|
+
table_name_schema_mapping: Dict[_TableName, SchemaInfo],
|
|
547
|
+
default_db: Optional[str] = None,
|
|
548
|
+
default_schema: Optional[str] = None,
|
|
482
549
|
) -> List[_ColumnLineageInfo]:
|
|
483
550
|
column_lineage: List[_ColumnLineageInfo] = []
|
|
484
551
|
|
|
485
552
|
try:
|
|
486
|
-
# List output columns.
|
|
487
553
|
output_columns = [
|
|
488
554
|
(select_col.alias_or_name, select_col) for select_col in statement.selects
|
|
489
555
|
]
|
|
490
556
|
logger.debug("output columns: %s", [col[0] for col in output_columns])
|
|
491
|
-
|
|
557
|
+
|
|
558
|
+
for output_col, _original_col_expression in output_columns:
|
|
492
559
|
if not output_col or output_col == "*":
|
|
493
560
|
# If schema information is available, the * will be expanded to the actual columns.
|
|
494
561
|
# Otherwise, we can't process it.
|
|
@@ -512,15 +579,14 @@ def _select_statement_cll(
|
|
|
512
579
|
trim_selects=False,
|
|
513
580
|
# We don't need to pass the schema in here, since we've already qualified the columns.
|
|
514
581
|
)
|
|
515
|
-
# import pathlib
|
|
516
|
-
# pathlib.Path("sqlglot.html").write_text(
|
|
517
|
-
# str(lineage_node.to_html(dialect=dialect))
|
|
518
|
-
# )
|
|
519
582
|
|
|
520
583
|
# Generate SELECT lineage.
|
|
521
|
-
direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
|
|
522
|
-
|
|
523
|
-
|
|
584
|
+
direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
|
|
585
|
+
lineage_node,
|
|
586
|
+
dialect,
|
|
587
|
+
default_db,
|
|
588
|
+
default_schema,
|
|
589
|
+
)
|
|
524
590
|
|
|
525
591
|
# Fuzzy resolve the output column.
|
|
526
592
|
original_col_expression = lineage_node.expression
|
|
@@ -539,7 +605,7 @@ def _select_statement_cll(
|
|
|
539
605
|
if original_col_expression.type:
|
|
540
606
|
output_col_type = original_col_expression.type
|
|
541
607
|
|
|
542
|
-
#
|
|
608
|
+
# Resolve upstream columns - table names should already be qualified from placeholder processing
|
|
543
609
|
direct_resolved_col_upstreams = {
|
|
544
610
|
_ColumnRef(
|
|
545
611
|
table=edge.table,
|
|
@@ -560,7 +626,7 @@ def _select_statement_cll(
|
|
|
560
626
|
column_type=output_col_type,
|
|
561
627
|
),
|
|
562
628
|
upstreams=sorted(direct_resolved_col_upstreams),
|
|
563
|
-
|
|
629
|
+
logic=_get_column_transformation(lineage_node, dialect),
|
|
564
630
|
)
|
|
565
631
|
)
|
|
566
632
|
|
|
@@ -575,6 +641,7 @@ def _select_statement_cll(
|
|
|
575
641
|
|
|
576
642
|
class _ColumnLineageWithDebugInfo(_ParserBaseModel):
|
|
577
643
|
column_lineage: List[_ColumnLineageInfo]
|
|
644
|
+
joins: Optional[List[_JoinInfo]] = None
|
|
578
645
|
|
|
579
646
|
select_statement: Optional[sqlglot.exp.Expression] = None
|
|
580
647
|
# TODO: Add column exceptions here.
|
|
@@ -624,6 +691,13 @@ def _column_level_lineage(
|
|
|
624
691
|
select_statement=select_statement,
|
|
625
692
|
)
|
|
626
693
|
|
|
694
|
+
# Handle VALUES expressions separately - they have no upstream tables and no column lineage
|
|
695
|
+
if isinstance(select_statement, sqlglot.exp.Values):
|
|
696
|
+
return _ColumnLineageWithDebugInfo(
|
|
697
|
+
column_lineage=[],
|
|
698
|
+
select_statement=select_statement,
|
|
699
|
+
)
|
|
700
|
+
|
|
627
701
|
assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
|
|
628
702
|
try:
|
|
629
703
|
root_scope = sqlglot.optimizer.build_scope(select_statement)
|
|
@@ -643,19 +717,35 @@ def _column_level_lineage(
|
|
|
643
717
|
root_scope=root_scope,
|
|
644
718
|
column_resolver=column_resolver,
|
|
645
719
|
output_table=downstream_table,
|
|
720
|
+
table_name_schema_mapping=table_name_schema_mapping,
|
|
721
|
+
default_db=default_db,
|
|
722
|
+
default_schema=default_schema,
|
|
646
723
|
)
|
|
647
724
|
|
|
725
|
+
joins: Optional[List[_JoinInfo]] = None
|
|
726
|
+
try:
|
|
727
|
+
# List join clauses.
|
|
728
|
+
joins = _list_joins(dialect=dialect, root_scope=root_scope)
|
|
729
|
+
logger.debug("Joins: %s", joins)
|
|
730
|
+
except Exception as e:
|
|
731
|
+
# This is a non-fatal error, so we can continue.
|
|
732
|
+
logger.debug("Failed to list joins: %s", e)
|
|
733
|
+
|
|
648
734
|
return _ColumnLineageWithDebugInfo(
|
|
649
735
|
column_lineage=column_lineage,
|
|
736
|
+
joins=joins,
|
|
650
737
|
select_statement=select_statement,
|
|
651
738
|
)
|
|
652
739
|
|
|
653
740
|
|
|
654
741
|
def _get_direct_raw_col_upstreams(
|
|
655
742
|
lineage_node: sqlglot.lineage.Node,
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
743
|
+
dialect: Optional[sqlglot.Dialect] = None,
|
|
744
|
+
default_db: Optional[str] = None,
|
|
745
|
+
default_schema: Optional[str] = None,
|
|
746
|
+
) -> OrderedSet[_ColumnRef]:
|
|
747
|
+
# Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
|
|
748
|
+
direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
|
|
659
749
|
|
|
660
750
|
for node in lineage_node.walk():
|
|
661
751
|
if node.downstream:
|
|
@@ -682,6 +772,53 @@ def _get_direct_raw_col_upstreams(
|
|
|
682
772
|
direct_raw_col_upstreams.add(
|
|
683
773
|
_ColumnRef(table=table_ref, column=normalized_col)
|
|
684
774
|
)
|
|
775
|
+
elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
|
|
776
|
+
# Handle placeholder expressions from lateral joins.
|
|
777
|
+
#
|
|
778
|
+
# In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
|
|
779
|
+
# expressions instead of regular table references. This is critical for lateral join column lineage.
|
|
780
|
+
#
|
|
781
|
+
# Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
|
|
782
|
+
# The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
|
|
783
|
+
# which we need to parse to establish the lineage: output.value <- my_table2.value
|
|
784
|
+
#
|
|
785
|
+
# Without this handling, lateral join column lineage would be incomplete/missing.
|
|
786
|
+
try:
|
|
787
|
+
parsed = sqlglot.parse_one(node.name, dialect=dialect)
|
|
788
|
+
if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
|
|
789
|
+
table_ref = _TableName.from_sqlglot_table(
|
|
790
|
+
sqlglot.parse_one(
|
|
791
|
+
parsed.table, into=sqlglot.exp.Table, dialect=dialect
|
|
792
|
+
)
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
# SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
|
|
796
|
+
# Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
|
|
797
|
+
# We need this runtime qualification to ensure proper lineage resolution.
|
|
798
|
+
# Only qualify if this appears to be a real table reference (not a temporary construct)
|
|
799
|
+
if (
|
|
800
|
+
not (table_ref.database or table_ref.db_schema)
|
|
801
|
+
and dialect is not None
|
|
802
|
+
):
|
|
803
|
+
table_ref = table_ref.qualified(
|
|
804
|
+
dialect=dialect,
|
|
805
|
+
default_db=default_db,
|
|
806
|
+
default_schema=default_schema,
|
|
807
|
+
)
|
|
808
|
+
|
|
809
|
+
# Extract column name using proper isinstance check
|
|
810
|
+
if isinstance(parsed.this, sqlglot.exp.Identifier):
|
|
811
|
+
column_name = parsed.this.name
|
|
812
|
+
else:
|
|
813
|
+
column_name = str(parsed.this)
|
|
814
|
+
direct_raw_col_upstreams.add(
|
|
815
|
+
_ColumnRef(table=table_ref, column=column_name)
|
|
816
|
+
)
|
|
817
|
+
except Exception as e:
|
|
818
|
+
logger.debug(
|
|
819
|
+
f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
|
|
820
|
+
exc_info=True,
|
|
821
|
+
)
|
|
685
822
|
else:
|
|
686
823
|
# This branch doesn't matter. For example, a count(*) column would go here, and
|
|
687
824
|
# we don't get any column-level lineage for that.
|
|
@@ -690,6 +827,268 @@ def _get_direct_raw_col_upstreams(
|
|
|
690
827
|
return direct_raw_col_upstreams
|
|
691
828
|
|
|
692
829
|
|
|
830
|
+
def _is_single_column_expression(
    expression: sqlglot.exp.Expression,
) -> bool:
    """Return True when the expression is just one column, optionally wrapped in an alias.

    Things like count(*) or coalesce(col, 0) are not single columns.
    """
    # Look through one level of aliasing so `col AS x` counts as a plain column.
    unaliased = (
        expression.this if isinstance(expression, sqlglot.exp.Alias) else expression
    )
    return isinstance(unaliased, sqlglot.exp.Column)
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def _get_column_transformation(
    lineage_node: sqlglot.lineage.Node,
    dialect: sqlglot.Dialect,
    parent: Optional[sqlglot.lineage.Node] = None,
) -> ColumnTransformation:
    """Describe how the column represented by a lineage node is computed.

    Walks down the lineage chain while each node is a single-column expression
    with exactly one downstream node, and reports the first expression that is
    either a leaf or not a simple pass-through.

    Args:
        lineage_node: The lineage node to inspect.
        dialect: Dialect used to render the expression as SQL text.
        parent: The node visited immediately before `lineage_node`, if any.

    Returns:
        A ColumnTransformation with the rendered SQL and a direct-copy flag.
    """
    children = lineage_node.downstream

    # Leaf node: report the parent's expression when there is one.
    if not children:
        if parent:
            leaf_expression = parent.expression
            direct_copy = _is_single_column_expression(leaf_expression)
        else:
            # This case should rarely happen.
            direct_copy = True
            leaf_expression = lineage_node.expression
        return ColumnTransformation(
            is_direct_copy=direct_copy,
            column_logic=leaf_expression.sql(dialect=dialect),
        )

    # Several inputs, or a computed expression: this node is the transformation.
    if len(children) > 1 or not _is_single_column_expression(
        lineage_node.expression
    ):
        return ColumnTransformation(
            is_direct_copy=False,
            column_logic=lineage_node.expression.sql(dialect=dialect),
        )

    # Pure pass-through of a single column: keep following the chain.
    return _get_column_transformation(
        lineage_node=children[0],
        dialect=dialect,
        parent=lineage_node,
    )
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def _get_join_side_tables(
    target: sqlglot.exp.Expression,
    dialect: sqlglot.Dialect,
    scope: sqlglot.optimizer.Scope,
) -> OrderedSet[_TableName]:
    """Resolve the tables behind one operand of a join.

    Args:
        target: The join operand (e.g. the `this` of a FROM or JOIN clause).
        dialect: Dialect passed through to the lineage helpers.
        scope: The scope whose sources are used to look up the operand.

    Returns:
        The set of tables the operand refers to, in discovery order.
    """
    operand_name = target.alias_or_name
    resolved_source = scope.sources.get(operand_name)

    # Direct table reference: no further resolution is needed.
    if resolved_source and isinstance(resolved_source, sqlglot.exp.Table):
        return OrderedSet([_TableName.from_sqlglot_table(resolved_source)])

    # Otherwise (e.g. the source is a Scope), we need to do some resolution work:
    # compute the upstream columns of `<operand>.*` and collect their tables.
    star_of_operand = sqlglot.exp.Column(
        this=sqlglot.exp.Star(),
        table=sqlglot.exp.Identifier(this=operand_name),
    )
    upstream_columns = _get_raw_col_upstreams_for_expression(
        select=star_of_operand,
        dialect=dialect,
        scope=scope,
    )
    return OrderedSet(upstream.table for upstream in upstream_columns)
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
def _get_raw_col_upstreams_for_expression(
    select: sqlglot.exp.Expression,
    dialect: sqlglot.Dialect,
    scope: sqlglot.optimizer.Scope,
) -> OrderedSet[_ColumnRef]:
    """Find the upstream columns of an arbitrary expression evaluated within a scope.

    The scope's expression is temporarily replaced by a copy that selects only
    `select`, lineage is computed for that single output column, and the
    original expression is restored afterwards, even on error.

    Args:
        select: The expression whose upstream columns are wanted.
        dialect: Dialect passed to sqlglot's lineage computation.
        scope: The scope in which `select` is evaluated.

    Returns:
        The upstream column references, or an empty set when the scope's
        expression is not a Query.
    """
    saved_expression = scope.expression
    if not isinstance(saved_expression, sqlglot.exp.Query):
        # Note that Select, Subquery, SetOperation, etc. are all subclasses of Query.
        # So this line should basically never happen.
        return OrderedSet()

    # Copy of the query whose only projection is the expression of interest.
    probe_expression = saved_expression.select(select, append=False, copy=True)

    try:
        scope.expression = probe_expression
        probe_node = sqlglot.lineage.to_node(
            column=0,
            scope=scope,
            dialect=dialect,
            trim_selects=False,
        )

        return _get_direct_raw_col_upstreams(probe_node, dialect, None, None)
    finally:
        # Always undo the temporary swap so the caller's scope is left unchanged.
        scope.expression = saved_expression
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
def _list_joins(
    dialect: sqlglot.Dialect,
    root_scope: sqlglot.optimizer.Scope,
) -> List[_JoinInfo]:
    """Collect join information from every scope reachable from `root_scope`.

    Args:
        dialect: Dialect used for rendering SQL text and resolving lineage.
        root_scope: The scope to traverse.

    Returns:
        One `_JoinInfo` per join (explicit JOINs and LATERAL constructs) whose
        tables could be resolved; joins that cannot be resolved are skipped.
    """
    # TODO: Add a confidence tracker here.

    joins: List[_JoinInfo] = []

    scope: sqlglot.optimizer.Scope
    for scope in root_scope.traverse():
        # PART 1: Handle regular explicit JOINs (updated API)
        join: sqlglot.exp.Join
        for join in scope.expression.find_all(sqlglot.exp.Join):
            # Left side: tables behind every FROM clause found in this scope.
            left_side_tables: OrderedSet[_TableName] = OrderedSet()
            from_clause: sqlglot.exp.From
            for from_clause in scope.find_all(sqlglot.exp.From):
                left_side_tables.update(
                    _get_join_side_tables(
                        target=from_clause.this,
                        dialect=dialect,
                        scope=scope,
                    )
                )

            # Right side: tables behind the join's own target.
            right_side_tables: OrderedSet[_TableName] = OrderedSet()
            if join_target := join.this:
                right_side_tables = _get_join_side_tables(
                    target=join_target,
                    dialect=dialect,
                    scope=scope,
                )

            # We don't need to check for `using` here because it's normalized to `on`
            # by the sqlglot optimizer.
            on_clause: Optional[sqlglot.exp.Expression] = join.args.get("on")
            if on_clause:
                joined_columns = _get_raw_col_upstreams_for_expression(
                    select=on_clause, dialect=dialect, scope=scope
                )

                unique_tables = OrderedSet(col.table for col in joined_columns)
                if not unique_tables:
                    logger.debug(
                        "Skipping join because we couldn't resolve the tables from the join condition: %s",
                        join.sql(dialect=dialect),
                    )
                    continue

                # When we have an `on` clause, we only want to include tables whose columns are
                # involved in the join condition. Without this, a statement like this:
                #   WITH cte_alias AS (select t1.id, t1.user_id, t2.other_col from t1 join t2 on t1.id = t2.id)
                #   SELECT * FROM users
                #   JOIN cte_alias ON users.id = cte_alias.user_id
                # would incorrectly include t2 as part of the left side tables.
                left_side_tables = OrderedSet(left_side_tables & unique_tables)
                right_side_tables = OrderedSet(right_side_tables & unique_tables)
            else:
                # Some joins (cross join, lateral join, etc.) don't have an ON clause.
                # In those cases, we have some best-effort logic to at least extract the
                # tables involved.
                joined_columns = OrderedSet()

                if not left_side_tables and not right_side_tables:
                    logger.debug(
                        "Skipping join because we couldn't resolve any tables from the join operands: %s",
                        join.sql(dialect=dialect),
                    )
                    continue
                elif len(left_side_tables | right_side_tables) == 1:
                    # When we don't have an ON clause, we're more strict about the
                    # minimum number of tables we need to resolve to avoid false positives.
                    # On the off chance someone is doing a self-cross-join, we'll miss it.
                    logger.debug(
                        "Skipping join because we couldn't resolve enough tables from the join operands: %s",
                        join.sql(dialect=dialect),
                    )
                    continue

            joins.append(
                _JoinInfo(
                    join_type=_get_join_type(join),
                    left_tables=list(left_side_tables),
                    right_tables=list(right_side_tables),
                    on_clause=on_clause.sql(dialect=dialect) if on_clause else None,
                    columns_involved=list(sorted(joined_columns)),
                )
            )

        # Handle LATERAL constructs
        for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
            # Get tables from non-lateral FROM clauses
            qualified_left: OrderedSet[_TableName] = OrderedSet()
            for from_clause in scope.find_all(sqlglot.exp.From):
                if not isinstance(from_clause.this, sqlglot.exp.Lateral):
                    qualified_left.update(
                        _get_join_side_tables(from_clause.this, dialect, scope)
                    )

            # Get tables from lateral subquery
            qualified_right: OrderedSet[_TableName] = OrderedSet()
            if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
                qualified_right.update(
                    _TableName.from_sqlglot_table(t)
                    for t in lateral.this.find_all(sqlglot.exp.Table)
                )
            # NOTE(review): the left-side tables are also added to the right side here;
            # confirm this statement's nesting level against the upstream source.
            qualified_right.update(qualified_left)

            if qualified_left and qualified_right:
                joins.append(
                    _JoinInfo(
                        join_type="LATERAL JOIN",
                        left_tables=list(qualified_left),
                        right_tables=list(qualified_right),
                        on_clause=None,
                        columns_involved=[],
                    )
                )

    return joins
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def _get_join_type(join: sqlglot.exp.Join) -> str:
    """Render the kind of a join as a human-readable string.

    Args:
        join: A sqlglot Join expression.

    Returns:
        Stringified join type e.g. "LEFT JOIN", "RIGHT OUTER JOIN", "LATERAL JOIN", etc.
    """
    # This logic was derived from the sqlglot join_sql method.
    # https://github.com/tobymao/sqlglot/blob/07bf71bae5d2a5c381104a86bb52c06809c21174/sqlglot/generator.py#L2248
    join_args = join.args
    joined_expression = join.this

    # Lateral targets are reported under their own names, ignoring other modifiers.
    if isinstance(joined_expression, sqlglot.exp.Lateral):
        uses_cross_apply = joined_expression.args.get("cross_apply") is not None
        return "CROSS APPLY" if uses_cross_apply else "LATERAL JOIN"

    join_kind = join_args.get("kind")

    # STRAIGHT_JOIN (MySQL) is a single keyword rather than "<kind> JOIN".
    if join_kind == "STRAIGHT":
        return "STRAIGHT_JOIN"

    # <method> <global> <side> <kind> JOIN
    # - method = "HASH", "MERGE"
    # - global = "GLOBAL"
    # - side = "LEFT", "RIGHT" (optional for SEMI/ANTI joins)
    # - kind = "INNER", "OUTER", "SEMI", "ANTI"
    candidate_parts = [
        join_args.get("method"),
        "GLOBAL" if join_args.get("global") else None,
        join_args.get("side"),
        join_kind,
        "JOIN",
    ]
    # Modifiers that are absent or empty are simply left out.
    return " ".join(part for part in candidate_parts if part)
|
|
1090
|
+
|
|
1091
|
+
|
|
693
1092
|
def _extract_select_from_create(
|
|
694
1093
|
statement: sqlglot.exp.Create,
|
|
695
1094
|
) -> sqlglot.exp.Expression:
|
|
@@ -784,7 +1183,12 @@ def _try_extract_select(
|
|
|
784
1183
|
statement = sqlglot.exp.Select().select("*").from_(statement)
|
|
785
1184
|
elif isinstance(statement, sqlglot.exp.Insert):
|
|
786
1185
|
# TODO Need to map column renames in the expressions part of the statement.
|
|
787
|
-
|
|
1186
|
+
# Preserve CTEs when extracting the SELECT expression from INSERT
|
|
1187
|
+
original_ctes = statement.ctes
|
|
1188
|
+
statement = statement.expression # Get the SELECT expression from the INSERT
|
|
1189
|
+
if isinstance(statement, sqlglot.exp.Query) and original_ctes:
|
|
1190
|
+
for cte in original_ctes:
|
|
1191
|
+
statement = statement.with_(alias=cte.alias, as_=cte.this)
|
|
788
1192
|
elif isinstance(statement, sqlglot.exp.Update):
|
|
789
1193
|
# Assumption: the output table is already captured in the modified tables list.
|
|
790
1194
|
statement = _extract_select_from_update(statement)
|
|
@@ -875,6 +1279,40 @@ def _translate_internal_column_lineage(
|
|
|
875
1279
|
)
|
|
876
1280
|
|
|
877
1281
|
|
|
1282
|
+
def _translate_internal_joins(
    table_name_urn_mapping: Dict[_TableName, str],
    raw_joins: List[_JoinInfo],
    dialect: sqlglot.Dialect,
) -> List[JoinInfo]:
    """Convert internal join records into URN-based `JoinInfo` records.

    Args:
        table_name_urn_mapping: Lookup from internal table names to URNs.
        raw_joins: Joins expressed with internal table/column references.
        dialect: Accepted for signature compatibility; not used in the body.

    Returns:
        The translated joins. A join that references any table missing from
        the mapping is dropped rather than raising.
    """
    translated = []
    for raw in raw_joins:
        try:
            left_urns = [table_name_urn_mapping[name] for name in raw.left_tables]
            right_urns = [table_name_urn_mapping[name] for name in raw.right_tables]
            column_refs = [
                ColumnRef(
                    table=table_name_urn_mapping[ref.table],
                    column=ref.column,
                )
                for ref in raw.columns_involved
            ]
            join_info = JoinInfo(
                join_type=raw.join_type,
                left_tables=left_urns,
                right_tables=right_urns,
                on_clause=raw.on_clause,
                columns_involved=column_refs,
            )
        except KeyError as e:
            # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
            logger.debug(f"Skipping join with unresolvable table: {e}")
            continue
        translated.append(join_info)
    return translated
|
|
1314
|
+
|
|
1315
|
+
|
|
878
1316
|
_StrOrNone = TypeVar("_StrOrNone", str, Optional[str])
|
|
879
1317
|
|
|
880
1318
|
|
|
@@ -923,12 +1361,12 @@ def _sqlglot_lineage_inner(
|
|
|
923
1361
|
schema_resolver: SchemaResolverInterface,
|
|
924
1362
|
default_db: Optional[str] = None,
|
|
925
1363
|
default_schema: Optional[str] = None,
|
|
926
|
-
|
|
1364
|
+
override_dialect: Optional[DialectOrStr] = None,
|
|
927
1365
|
) -> SqlParsingResult:
|
|
928
|
-
if
|
|
929
|
-
dialect = get_dialect(
|
|
1366
|
+
if override_dialect:
|
|
1367
|
+
dialect = get_dialect(override_dialect)
|
|
930
1368
|
else:
|
|
931
|
-
dialect = get_dialect(
|
|
1369
|
+
dialect = get_dialect(schema_resolver.platform)
|
|
932
1370
|
|
|
933
1371
|
default_db = _normalize_db_or_schema(default_db, dialect)
|
|
934
1372
|
default_schema = _normalize_db_or_schema(default_schema, dialect)
|
|
@@ -1034,6 +1472,7 @@ def _sqlglot_lineage_inner(
|
|
|
1034
1472
|
)
|
|
1035
1473
|
|
|
1036
1474
|
column_lineage: Optional[List[_ColumnLineageInfo]] = None
|
|
1475
|
+
joins = None
|
|
1037
1476
|
try:
|
|
1038
1477
|
with cooperative_timeout(
|
|
1039
1478
|
timeout=(
|
|
@@ -1049,6 +1488,7 @@ def _sqlglot_lineage_inner(
|
|
|
1049
1488
|
default_schema=default_schema,
|
|
1050
1489
|
)
|
|
1051
1490
|
column_lineage = column_lineage_debug_info.column_lineage
|
|
1491
|
+
joins = column_lineage_debug_info.joins
|
|
1052
1492
|
except CooperativeTimeoutError as e:
|
|
1053
1493
|
logger.debug(f"Timed out while generating column-level lineage: {e}")
|
|
1054
1494
|
debug_info.column_error = e
|
|
@@ -1081,6 +1521,14 @@ def _sqlglot_lineage_inner(
|
|
|
1081
1521
|
f"Failed to translate column lineage to urns: {e}", exc_info=True
|
|
1082
1522
|
)
|
|
1083
1523
|
debug_info.column_error = e
|
|
1524
|
+
joins_urns = None
|
|
1525
|
+
if joins is not None:
|
|
1526
|
+
try:
|
|
1527
|
+
joins_urns = _translate_internal_joins(
|
|
1528
|
+
table_name_urn_mapping, raw_joins=joins, dialect=dialect
|
|
1529
|
+
)
|
|
1530
|
+
except KeyError as e:
|
|
1531
|
+
logger.debug(f"Failed to translate joins to urns: {e}", exc_info=True)
|
|
1084
1532
|
|
|
1085
1533
|
query_type, query_type_props = get_query_type_of_sql(
|
|
1086
1534
|
original_statement, dialect=dialect
|
|
@@ -1095,6 +1543,7 @@ def _sqlglot_lineage_inner(
|
|
|
1095
1543
|
in_tables=in_urns,
|
|
1096
1544
|
out_tables=out_urns,
|
|
1097
1545
|
column_lineage=column_lineage_urns,
|
|
1546
|
+
joins=joins_urns,
|
|
1098
1547
|
debug_info=debug_info,
|
|
1099
1548
|
)
|
|
1100
1549
|
|
|
@@ -1104,7 +1553,7 @@ def _sqlglot_lineage_nocache(
|
|
|
1104
1553
|
schema_resolver: SchemaResolverInterface,
|
|
1105
1554
|
default_db: Optional[str] = None,
|
|
1106
1555
|
default_schema: Optional[str] = None,
|
|
1107
|
-
|
|
1556
|
+
override_dialect: Optional[DialectOrStr] = None,
|
|
1108
1557
|
) -> SqlParsingResult:
|
|
1109
1558
|
"""Parse a SQL statement and generate lineage information.
|
|
1110
1559
|
|
|
@@ -1122,8 +1571,8 @@ def _sqlglot_lineage_nocache(
|
|
|
1122
1571
|
can be brittle with respect to missing schema information and complex
|
|
1123
1572
|
SQL logic like UNNESTs.
|
|
1124
1573
|
|
|
1125
|
-
The SQL dialect
|
|
1126
|
-
be
|
|
1574
|
+
The SQL dialect will be inferred from the schema_resolver's platform.
|
|
1575
|
+
That inference can be overridden by passing an override_dialect argument.
|
|
1127
1576
|
The set of supported dialects is the same as sqlglot's. See their
|
|
1128
1577
|
`documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
|
|
1129
1578
|
for the full list.
|
|
@@ -1138,7 +1587,7 @@ def _sqlglot_lineage_nocache(
|
|
|
1138
1587
|
schema_resolver: The schema resolver to use for resolving table schemas.
|
|
1139
1588
|
default_db: The default database to use for unqualified table names.
|
|
1140
1589
|
default_schema: The default schema to use for unqualified table names.
|
|
1141
|
-
|
|
1590
|
+
override_dialect: Override the dialect provided by 'schema_resolver'.
|
|
1142
1591
|
|
|
1143
1592
|
Returns:
|
|
1144
1593
|
A SqlParsingResult object containing the parsed lineage information.
|
|
@@ -1163,10 +1612,32 @@ def _sqlglot_lineage_nocache(
|
|
|
1163
1612
|
schema_resolver=schema_resolver,
|
|
1164
1613
|
default_db=default_db,
|
|
1165
1614
|
default_schema=default_schema,
|
|
1166
|
-
|
|
1615
|
+
override_dialect=override_dialect,
|
|
1167
1616
|
)
|
|
1168
1617
|
except Exception as e:
|
|
1169
1618
|
return SqlParsingResult.make_from_error(e)
|
|
1619
|
+
except BaseException as e:
|
|
1620
|
+
# Check if this is a PanicException from SQLGlot's Rust tokenizer
|
|
1621
|
+
# We use runtime type checking instead of isinstance() because pyo3_runtime
|
|
1622
|
+
# is only available when sqlglot[rs] is installed and may not be importable
|
|
1623
|
+
# at module load time, but the exception can still be raised at runtime
|
|
1624
|
+
if (
|
|
1625
|
+
e.__class__.__name__ == "PanicException"
|
|
1626
|
+
and e.__class__.__module__ == "pyo3_runtime"
|
|
1627
|
+
):
|
|
1628
|
+
# Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
|
|
1629
|
+
# pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
|
|
1630
|
+
# KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
|
|
1631
|
+
# Avoid catching BaseException, as it includes KeyboardInterrupt
|
|
1632
|
+
# and would prevent Ctrl+C from working.
|
|
1633
|
+
wrapped_exception = Exception(
|
|
1634
|
+
f"pyo3_runtime.PanicException during SQL parsing: {e}"
|
|
1635
|
+
)
|
|
1636
|
+
wrapped_exception.__cause__ = e
|
|
1637
|
+
return SqlParsingResult.make_from_error(wrapped_exception)
|
|
1638
|
+
else:
|
|
1639
|
+
# Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
|
|
1640
|
+
raise
|
|
1170
1641
|
|
|
1171
1642
|
|
|
1172
1643
|
_sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
|
|
@@ -1179,15 +1650,15 @@ def sqlglot_lineage(
|
|
|
1179
1650
|
schema_resolver: SchemaResolverInterface,
|
|
1180
1651
|
default_db: Optional[str] = None,
|
|
1181
1652
|
default_schema: Optional[str] = None,
|
|
1182
|
-
|
|
1653
|
+
override_dialect: Optional[DialectOrStr] = None,
|
|
1183
1654
|
) -> SqlParsingResult:
|
|
1184
1655
|
if schema_resolver.includes_temp_tables():
|
|
1185
1656
|
return _sqlglot_lineage_nocache(
|
|
1186
|
-
sql, schema_resolver, default_db, default_schema,
|
|
1657
|
+
sql, schema_resolver, default_db, default_schema, override_dialect
|
|
1187
1658
|
)
|
|
1188
1659
|
else:
|
|
1189
1660
|
return _sqlglot_lineage_cached(
|
|
1190
|
-
sql, schema_resolver, default_db, default_schema,
|
|
1661
|
+
sql, schema_resolver, default_db, default_schema, override_dialect
|
|
1191
1662
|
)
|
|
1192
1663
|
|
|
1193
1664
|
|
|
@@ -1239,6 +1710,7 @@ def create_lineage_sql_parsed_result(
|
|
|
1239
1710
|
default_schema: Optional[str] = None,
|
|
1240
1711
|
graph: Optional[DataHubGraph] = None,
|
|
1241
1712
|
schema_aware: bool = True,
|
|
1713
|
+
override_dialect: Optional[DialectOrStr] = None,
|
|
1242
1714
|
) -> SqlParsingResult:
|
|
1243
1715
|
schema_resolver = create_schema_resolver(
|
|
1244
1716
|
platform=platform,
|
|
@@ -1258,6 +1730,7 @@ def create_lineage_sql_parsed_result(
|
|
|
1258
1730
|
schema_resolver=schema_resolver,
|
|
1259
1731
|
default_db=default_db,
|
|
1260
1732
|
default_schema=default_schema,
|
|
1733
|
+
override_dialect=override_dialect,
|
|
1261
1734
|
)
|
|
1262
1735
|
except Exception as e:
|
|
1263
1736
|
return SqlParsingResult.make_from_error(e)
|