PyPI - acryl-datahub - Versions diffs - 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl - Mend

acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show

{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
datahub/_version.py +1 -1
datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
datahub/api/entities/assertion/assertion.py +1 -1
datahub/api/entities/common/serialized_value.py +1 -1
datahub/api/entities/corpgroup/corpgroup.py +1 -1
datahub/api/entities/datacontract/datacontract.py +35 -3
datahub/api/entities/datajob/dataflow.py +18 -3
datahub/api/entities/datajob/datajob.py +24 -4
datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
datahub/api/entities/dataproduct/dataproduct.py +32 -3
datahub/api/entities/dataset/dataset.py +47 -72
datahub/api/entities/external/__init__.py +0 -0
datahub/api/entities/external/external_entities.py +724 -0
datahub/api/entities/external/external_tag.py +147 -0
datahub/api/entities/external/lake_formation_external_entites.py +162 -0
datahub/api/entities/external/restricted_text.py +172 -0
datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
datahub/api/entities/forms/forms.py +37 -37
datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
datahub/api/graphql/assertion.py +1 -1
datahub/api/graphql/base.py +8 -6
datahub/api/graphql/operation.py +14 -10
datahub/cli/check_cli.py +91 -9
datahub/cli/cli_utils.py +63 -0
datahub/cli/config_utils.py +20 -12
datahub/cli/container_cli.py +5 -0
datahub/cli/delete_cli.py +133 -34
datahub/cli/docker_check.py +110 -14
datahub/cli/docker_cli.py +155 -231
datahub/cli/exists_cli.py +2 -3
datahub/cli/get_cli.py +2 -3
datahub/cli/graphql_cli.py +1422 -0
datahub/cli/iceberg_cli.py +11 -5
datahub/cli/ingest_cli.py +25 -26
datahub/cli/migrate.py +12 -9
datahub/cli/migration_utils.py +4 -3
datahub/cli/put_cli.py +4 -6
datahub/cli/quickstart_versioning.py +53 -10
datahub/cli/specific/assertions_cli.py +39 -7
datahub/cli/specific/datacontract_cli.py +57 -9
datahub/cli/specific/dataproduct_cli.py +12 -24
datahub/cli/specific/dataset_cli.py +31 -21
datahub/cli/specific/forms_cli.py +2 -5
datahub/cli/specific/group_cli.py +2 -3
datahub/cli/specific/structuredproperties_cli.py +5 -7
datahub/cli/specific/user_cli.py +174 -4
datahub/cli/state_cli.py +2 -3
datahub/cli/timeline_cli.py +2 -3
datahub/configuration/common.py +46 -2
datahub/configuration/connection_resolver.py +5 -2
datahub/configuration/env_vars.py +331 -0
datahub/configuration/import_resolver.py +7 -4
datahub/configuration/kafka.py +21 -1
datahub/configuration/pydantic_migration_helpers.py +6 -13
datahub/configuration/source_common.py +4 -3
datahub/configuration/validate_field_deprecation.py +5 -2
datahub/configuration/validate_field_removal.py +8 -2
datahub/configuration/validate_field_rename.py +6 -5
datahub/configuration/validate_multiline_string.py +5 -2
datahub/emitter/mce_builder.py +12 -8
datahub/emitter/mcp.py +20 -5
datahub/emitter/mcp_builder.py +12 -0
datahub/emitter/request_helper.py +138 -15
datahub/emitter/response_helper.py +111 -19
datahub/emitter/rest_emitter.py +399 -163
datahub/entrypoints.py +10 -5
datahub/errors.py +12 -0
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
datahub/ingestion/api/common.py +9 -0
datahub/ingestion/api/decorators.py +15 -3
datahub/ingestion/api/report.py +381 -3
datahub/ingestion/api/sink.py +27 -2
datahub/ingestion/api/source.py +174 -62
datahub/ingestion/api/source_helpers.py +41 -3
datahub/ingestion/api/source_protocols.py +23 -0
datahub/ingestion/autogenerated/__init__.py +0 -0
datahub/ingestion/autogenerated/capability_summary.json +3652 -0
datahub/ingestion/autogenerated/lineage.json +402 -0
datahub/ingestion/autogenerated/lineage_helper.py +177 -0
datahub/ingestion/extractor/schema_util.py +31 -5
datahub/ingestion/glossary/classification_mixin.py +9 -2
datahub/ingestion/graph/client.py +492 -55
datahub/ingestion/graph/config.py +18 -2
datahub/ingestion/graph/filters.py +96 -32
datahub/ingestion/graph/links.py +55 -0
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
datahub/ingestion/run/pipeline.py +90 -23
datahub/ingestion/run/pipeline_config.py +3 -3
datahub/ingestion/sink/datahub_kafka.py +1 -0
datahub/ingestion/sink/datahub_rest.py +31 -23
datahub/ingestion/sink/file.py +1 -0
datahub/ingestion/source/abs/config.py +1 -1
datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
datahub/ingestion/source/abs/source.py +15 -30
datahub/ingestion/source/apply/datahub_apply.py +6 -5
datahub/ingestion/source/aws/aws_common.py +185 -13
datahub/ingestion/source/aws/glue.py +517 -244
datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
datahub/ingestion/source/aws/tag_entities.py +270 -0
datahub/ingestion/source/azure/azure_common.py +3 -3
datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
datahub/ingestion/source/bigquery_v2/common.py +1 -1
datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
datahub/ingestion/source/bigquery_v2/queries.py +3 -3
datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
datahub/ingestion/source/cassandra/cassandra.py +7 -18
datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
datahub/ingestion/source/common/data_platforms.py +23 -0
datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
datahub/ingestion/source/common/subtypes.py +73 -1
datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
datahub/ingestion/source/data_lake_common/object_store.py +732 -0
datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
datahub/ingestion/source/datahub/config.py +19 -5
datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
datahub/ingestion/source/datahub/datahub_source.py +11 -1
datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
datahub/ingestion/source/dbt/dbt_common.py +270 -26
datahub/ingestion/source/dbt/dbt_core.py +88 -47
datahub/ingestion/source/dbt/dbt_tests.py +8 -6
datahub/ingestion/source/debug/__init__.py +0 -0
datahub/ingestion/source/debug/datahub_debug.py +300 -0
datahub/ingestion/source/delta_lake/config.py +9 -5
datahub/ingestion/source/delta_lake/source.py +8 -0
datahub/ingestion/source/dremio/dremio_api.py +114 -73
datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
datahub/ingestion/source/dremio/dremio_config.py +5 -4
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
datahub/ingestion/source/dremio/dremio_entities.py +6 -5
datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
datahub/ingestion/source/dremio/dremio_source.py +228 -215
datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
datahub/ingestion/source/excel/__init__.py +0 -0
datahub/ingestion/source/excel/config.py +92 -0
datahub/ingestion/source/excel/excel_file.py +539 -0
datahub/ingestion/source/excel/profiling.py +308 -0
datahub/ingestion/source/excel/report.py +49 -0
datahub/ingestion/source/excel/source.py +662 -0
datahub/ingestion/source/excel/util.py +18 -0
datahub/ingestion/source/feast.py +12 -14
datahub/ingestion/source/file.py +3 -0
datahub/ingestion/source/fivetran/config.py +67 -8
datahub/ingestion/source/fivetran/fivetran.py +228 -43
datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
datahub/ingestion/source/fivetran/response_models.py +97 -0
datahub/ingestion/source/gc/datahub_gc.py +0 -2
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
datahub/ingestion/source/gcs/gcs_source.py +53 -10
datahub/ingestion/source/gcs/gcs_utils.py +36 -9
datahub/ingestion/source/ge_data_profiler.py +146 -33
datahub/ingestion/source/ge_profiling_config.py +26 -11
datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
datahub/ingestion/source/grafana/field_utils.py +307 -0
datahub/ingestion/source/grafana/grafana_api.py +142 -0
datahub/ingestion/source/grafana/grafana_config.py +104 -0
datahub/ingestion/source/grafana/grafana_source.py +522 -84
datahub/ingestion/source/grafana/lineage.py +202 -0
datahub/ingestion/source/grafana/models.py +137 -0
datahub/ingestion/source/grafana/report.py +90 -0
datahub/ingestion/source/grafana/types.py +16 -0
datahub/ingestion/source/hex/__init__.py +0 -0
datahub/ingestion/source/hex/api.py +402 -0
datahub/ingestion/source/hex/constants.py +8 -0
datahub/ingestion/source/hex/hex.py +311 -0
datahub/ingestion/source/hex/mapper.py +412 -0
datahub/ingestion/source/hex/model.py +78 -0
datahub/ingestion/source/hex/query_fetcher.py +307 -0
datahub/ingestion/source/iceberg/iceberg.py +385 -164
datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
datahub/ingestion/source/identity/azure_ad.py +1 -1
datahub/ingestion/source/identity/okta.py +1 -14
datahub/ingestion/source/kafka/kafka.py +28 -71
datahub/ingestion/source/kafka/kafka_config.py +78 -0
datahub/ingestion/source/kafka_connect/common.py +2 -2
datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
datahub/ingestion/source/ldap.py +1 -1
datahub/ingestion/source/looker/looker_common.py +216 -86
datahub/ingestion/source/looker/looker_config.py +15 -4
datahub/ingestion/source/looker/looker_constant.py +4 -0
datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
datahub/ingestion/source/looker/looker_source.py +539 -555
datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
datahub/ingestion/source/looker/lookml_config.py +31 -3
datahub/ingestion/source/looker/lookml_refinement.py +1 -1
datahub/ingestion/source/looker/lookml_source.py +103 -118
datahub/ingestion/source/looker/view_upstream.py +494 -1
datahub/ingestion/source/metabase.py +32 -6
datahub/ingestion/source/metadata/business_glossary.py +7 -7
datahub/ingestion/source/metadata/lineage.py +11 -10
datahub/ingestion/source/mlflow.py +254 -23
datahub/ingestion/source/mock_data/__init__.py +0 -0
datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
datahub/ingestion/source/mode.py +359 -181
datahub/ingestion/source/mongodb.py +11 -1
datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
datahub/ingestion/source/nifi.py +5 -5
datahub/ingestion/source/openapi.py +85 -38
datahub/ingestion/source/openapi_parser.py +59 -40
datahub/ingestion/source/powerbi/config.py +92 -27
datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
datahub/ingestion/source/powerbi/powerbi.py +66 -32
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
datahub/ingestion/source/preset.py +3 -3
datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
datahub/ingestion/source/redash.py +1 -1
datahub/ingestion/source/redshift/config.py +15 -9
datahub/ingestion/source/redshift/datashares.py +1 -1
datahub/ingestion/source/redshift/lineage.py +386 -687
datahub/ingestion/source/redshift/profile.py +2 -2
datahub/ingestion/source/redshift/query.py +24 -20
datahub/ingestion/source/redshift/redshift.py +52 -111
datahub/ingestion/source/redshift/redshift_schema.py +17 -12
datahub/ingestion/source/redshift/report.py +0 -2
datahub/ingestion/source/redshift/usage.py +13 -11
datahub/ingestion/source/s3/report.py +4 -2
datahub/ingestion/source/s3/source.py +515 -244
datahub/ingestion/source/sac/sac.py +3 -1
datahub/ingestion/source/salesforce.py +28 -13
datahub/ingestion/source/schema/json_schema.py +14 -14
datahub/ingestion/source/schema_inference/object.py +22 -6
datahub/ingestion/source/sigma/config.py +75 -8
datahub/ingestion/source/sigma/data_classes.py +3 -0
datahub/ingestion/source/sigma/sigma.py +36 -7
datahub/ingestion/source/sigma/sigma_api.py +99 -58
datahub/ingestion/source/slack/slack.py +403 -140
datahub/ingestion/source/snaplogic/__init__.py +0 -0
datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
datahub/ingestion/source/snowflake/constants.py +4 -0
datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
datahub/ingestion/source/sql/athena.py +219 -26
datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
datahub/ingestion/source/sql/clickhouse.py +29 -9
datahub/ingestion/source/sql/cockroachdb.py +5 -4
datahub/ingestion/source/sql/druid.py +9 -4
datahub/ingestion/source/sql/hana.py +3 -1
datahub/ingestion/source/sql/hive.py +28 -8
datahub/ingestion/source/sql/hive_metastore.py +24 -25
datahub/ingestion/source/sql/mariadb.py +0 -1
datahub/ingestion/source/sql/mssql/job_models.py +18 -2
datahub/ingestion/source/sql/mssql/source.py +376 -62
datahub/ingestion/source/sql/mysql.py +154 -4
datahub/ingestion/source/sql/oracle.py +62 -11
datahub/ingestion/source/sql/postgres.py +142 -6
datahub/ingestion/source/sql/presto.py +20 -2
datahub/ingestion/source/sql/sql_common.py +281 -49
datahub/ingestion/source/sql/sql_config.py +1 -34
datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
datahub/ingestion/source/sql/sql_types.py +27 -2
datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
datahub/ingestion/source/sql/teradata.py +1028 -245
datahub/ingestion/source/sql/trino.py +43 -10
datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
datahub/ingestion/source/sql/vertica.py +14 -7
datahub/ingestion/source/sql_queries.py +219 -121
datahub/ingestion/source/state/checkpoint.py +8 -29
datahub/ingestion/source/state/entity_removal_state.py +5 -2
datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
datahub/ingestion/source/superset.py +810 -126
datahub/ingestion/source/tableau/tableau.py +172 -69
datahub/ingestion/source/tableau/tableau_common.py +11 -4
datahub/ingestion/source/tableau/tableau_constant.py +1 -4
datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
datahub/ingestion/source/tableau/tableau_validation.py +1 -1
datahub/ingestion/source/unity/config.py +161 -40
datahub/ingestion/source/unity/connection.py +61 -0
datahub/ingestion/source/unity/connection_test.py +1 -0
datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
datahub/ingestion/source/unity/proxy.py +794 -51
datahub/ingestion/source/unity/proxy_patch.py +321 -0
datahub/ingestion/source/unity/proxy_types.py +36 -2
datahub/ingestion/source/unity/report.py +15 -3
datahub/ingestion/source/unity/source.py +465 -131
datahub/ingestion/source/unity/tag_entities.py +197 -0
datahub/ingestion/source/unity/usage.py +46 -4
datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
datahub/ingestion/source/usage/usage_common.py +4 -68
datahub/ingestion/source/vertexai/__init__.py +0 -0
datahub/ingestion/source/vertexai/vertexai.py +1367 -0
datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
datahub/ingestion/source_config/pulsar.py +3 -1
datahub/ingestion/source_report/ingestion_stage.py +50 -11
datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
datahub/ingestion/transformer/base_transformer.py +8 -5
datahub/ingestion/transformer/dataset_domain.py +1 -1
datahub/ingestion/transformer/set_browse_path.py +112 -0
datahub/integrations/assertion/common.py +3 -2
datahub/integrations/assertion/snowflake/compiler.py +4 -3
datahub/lite/lite_util.py +2 -2
datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
datahub/metadata/_urns/urn_defs.py +1866 -1582
datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
datahub/metadata/schema.avsc +18404 -16617
datahub/metadata/schema_classes.py +3 -3
datahub/metadata/schemas/Actors.avsc +38 -1
datahub/metadata/schemas/ApplicationKey.avsc +31 -0
datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
datahub/metadata/schemas/Applications.avsc +38 -0
datahub/metadata/schemas/AssetSettings.avsc +63 -0
datahub/metadata/schemas/ChartInfo.avsc +2 -1
datahub/metadata/schemas/ChartKey.avsc +1 -0
datahub/metadata/schemas/ContainerKey.avsc +1 -0
datahub/metadata/schemas/ContainerProperties.avsc +8 -0
datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
datahub/metadata/schemas/CorpUserKey.avsc +2 -1
datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
datahub/metadata/schemas/DashboardKey.avsc +1 -0
datahub/metadata/schemas/DataContractKey.avsc +2 -1
datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
datahub/metadata/schemas/DataFlowKey.avsc +1 -0
datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
datahub/metadata/schemas/DataJobInfo.avsc +8 -0
datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
datahub/metadata/schemas/DataJobKey.avsc +1 -0
datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
datahub/metadata/schemas/DataProcessKey.avsc +8 -0
datahub/metadata/schemas/DataProductKey.avsc +3 -1
datahub/metadata/schemas/DataProductProperties.avsc +1 -1
datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
datahub/metadata/schemas/DatasetKey.avsc +11 -1
datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
datahub/metadata/schemas/Deprecation.avsc +2 -0
datahub/metadata/schemas/DomainKey.avsc +2 -1
datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
datahub/metadata/schemas/FormInfo.avsc +5 -0
datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
datahub/metadata/schemas/IncidentInfo.avsc +3 -3
datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
datahub/metadata/schemas/LogicalParent.avsc +145 -0
datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
datahub/metadata/schemas/MLModelKey.avsc +9 -0
datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
datahub/metadata/schemas/NotebookKey.avsc +1 -0
datahub/metadata/schemas/Operation.avsc +21 -2
datahub/metadata/schemas/Ownership.avsc +69 -0
datahub/metadata/schemas/QueryProperties.avsc +24 -2
datahub/metadata/schemas/QuerySubjects.avsc +1 -12
datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
datahub/metadata/schemas/Siblings.avsc +2 -0
datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
datahub/metadata/schemas/StructuredProperties.avsc +69 -0
datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
datahub/metadata/schemas/SystemMetadata.avsc +147 -0
datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
datahub/metadata/schemas/__init__.py +3 -3
datahub/sdk/__init__.py +7 -0
datahub/sdk/_all_entities.py +15 -0
datahub/sdk/_shared.py +393 -10
datahub/sdk/_utils.py +4 -0
datahub/sdk/chart.py +386 -0
datahub/sdk/container.py +7 -0
datahub/sdk/dashboard.py +453 -0
datahub/sdk/dataflow.py +309 -0
datahub/sdk/datajob.py +367 -0
datahub/sdk/dataset.py +180 -4
datahub/sdk/entity.py +99 -3
datahub/sdk/entity_client.py +154 -12
datahub/sdk/lineage_client.py +943 -0
datahub/sdk/main_client.py +83 -8
datahub/sdk/mlmodel.py +383 -0
datahub/sdk/mlmodelgroup.py +240 -0
datahub/sdk/search_client.py +85 -8
datahub/sdk/search_filters.py +393 -68
datahub/secret/datahub_secret_store.py +5 -1
datahub/secret/environment_secret_store.py +29 -0
datahub/secret/file_secret_store.py +49 -0
datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
datahub/specific/aspect_helpers/siblings.py +73 -0
datahub/specific/aspect_helpers/structured_properties.py +27 -0
datahub/specific/chart.py +1 -1
datahub/specific/datajob.py +15 -1
datahub/specific/dataproduct.py +4 -0
datahub/specific/dataset.py +51 -59
datahub/sql_parsing/_sqlglot_patch.py +1 -2
datahub/sql_parsing/fingerprint_utils.py +6 -0
datahub/sql_parsing/split_statements.py +30 -3
datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
datahub/sql_parsing/sqlglot_lineage.py +517 -44
datahub/sql_parsing/sqlglot_utils.py +30 -18
datahub/sql_parsing/tool_meta_extractor.py +25 -2
datahub/telemetry/telemetry.py +30 -16
datahub/testing/check_imports.py +1 -1
datahub/testing/docker_utils.py +8 -2
datahub/testing/mce_helpers.py +421 -0
datahub/testing/mcp_diff.py +17 -21
datahub/testing/sdk_v2_helpers.py +18 -0
datahub/upgrade/upgrade.py +86 -30
datahub/utilities/file_backed_collections.py +14 -15
datahub/utilities/hive_schema_to_avro.py +2 -2
datahub/utilities/ingest_utils.py +2 -2
datahub/utilities/is_pytest.py +3 -2
datahub/utilities/logging_manager.py +30 -7
datahub/utilities/mapping.py +29 -2
datahub/utilities/sample_data.py +5 -4
datahub/utilities/server_config_util.py +298 -10
datahub/utilities/sqlalchemy_query_combiner.py +6 -4
datahub/utilities/stats_collections.py +4 -0
datahub/utilities/threaded_iterator_executor.py +16 -3
datahub/utilities/urn_encoder.py +1 -1
datahub/utilities/urns/urn.py +41 -2
datahub/emitter/sql_parsing_builder.py +0 -306
datahub/ingestion/source/redshift/lineage_v2.py +0 -458
datahub/ingestion/source/vertexai.py +0 -697
datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
{acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/sqlglot_lineage.py CHANGED Viewed

@@ -5,7 +5,18 @@ import functools
 import logging
 import traceback
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
+from typing import (
+    AbstractSet,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+)
 import pydantic.dataclasses
 import sqlglot
@@ -45,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,
@@ -54,6 +66,7 @@ from datahub.utilities.cooperative_timeout import (
     CooperativeTimeoutError,
     cooperative_timeout,
 )
+from datahub.utilities.ordered_set import OrderedSet
 assert SQLGLOT_PATCHED
@@ -112,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+          What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+          auto-generated class from .pdl model files. We need generic solution allowing us to either:
+          1. Implement hashing for .pdl model objects
+          2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+             hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+             to understand that instruction as well.
+          Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -127,20 +151,53 @@ class DownstreamColumnRef(_ParserBaseModel):
             return v
         return SchemaFieldDataTypeClass.from_obj(v)
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
+class ColumnTransformation(_FrozenModel):
+    is_direct_copy: bool
+    column_logic: str
 class _ColumnLineageInfo(_ParserBaseModel):
     downstream: _DownstreamColumnRef
     upstreams: List[_ColumnRef]
-    logic: Optional[str] = None
+    logic: Optional[ColumnTransformation] = None
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+          To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+          depending on it.
+          Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
-    # Logic for this column, as a SQL expression.
-    logic: Optional[str] = pydantic.Field(default=None, exclude=True)
+    logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+class _JoinInfo(_ParserBaseModel):
+    join_type: str
+    left_tables: List[_TableName]
+    right_tables: List[_TableName]
+    on_clause: Optional[str]
+    columns_involved: List[_ColumnRef]
+class JoinInfo(_ParserBaseModel):
+    join_type: str
+    left_tables: List[Urn]
+    right_tables: List[Urn]
+    on_clause: Optional[str]
+    columns_involved: List[ColumnRef]
 class SqlParsingDebugInfo(_ParserBaseModel):
@@ -178,6 +235,7 @@ class SqlParsingResult(_ParserBaseModel):
     out_tables: List[Urn]
     column_lineage: Optional[List[ColumnLineageInfo]] = None
+    joins: Optional[List[JoinInfo]] = None
     # TODO include formatted original sql logic
     # TODO include list of referenced columns
@@ -197,13 +255,19 @@ class SqlParsingResult(_ParserBaseModel):
         )
+def _extract_table_names(
+    iterable: Iterable[sqlglot.exp.Table],
+) -> OrderedSet[_TableName]:
+    return OrderedSet(_TableName.from_sqlglot_table(table) for table in iterable)
 def _table_level_lineage(
     statement: sqlglot.Expression, dialect: sqlglot.Dialect
-) -> Tuple[Set[_TableName], Set[_TableName]]:
+) -> Tuple[AbstractSet[_TableName], AbstractSet[_TableName]]:
     # Generate table-level lineage.
     modified = (
-        {
-            _TableName.from_sqlglot_table(expr.this)
+        _extract_table_names(
+            expr.this
             for expr in statement.find_all(
                 sqlglot.exp.Create,
                 sqlglot.exp.Insert,
@@ -215,36 +279,36 @@ def _table_level_lineage(
             # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)",
             # the `this` on the INSERT part isn't a table.
             if isinstance(expr.this, sqlglot.exp.Table)
-        }
-        | {
+        )
+        | _extract_table_names(
             # For statements that include a column list, like
             # CREATE DDL statements and `INSERT INTO table (col1, col2) SELECT ...`
             # the table name is nested inside a Schema object.
-            _TableName.from_sqlglot_table(expr.this.this)
+            expr.this.this
             for expr in statement.find_all(
                 sqlglot.exp.Create,
                 sqlglot.exp.Insert,
             )
             if isinstance(expr.this, sqlglot.exp.Schema)
             and isinstance(expr.this.this, sqlglot.exp.Table)
-        }
-        | {
+        )
+        | _extract_table_names(
             # For drop statements, we only want it if a table/view is being dropped.
             # Other "kinds" will not have table.name populated.
-            _TableName.from_sqlglot_table(expr.this)
+            expr.this
             for expr in ([statement] if isinstance(statement, sqlglot.exp.Drop) else [])
             if isinstance(expr.this, sqlglot.exp.Table)
             and expr.this.this
             and expr.this.name
-        }
+        )
     )
     tables = (
-        {
-            _TableName.from_sqlglot_table(table)
+        _extract_table_names(
+            table
             for table in statement.find_all(sqlglot.exp.Table)
             if not isinstance(table.parent, sqlglot.exp.Drop)
-        }
+        )
         # ignore references created in this query
         - modified
         # ignore CTEs created in this statement
@@ -479,16 +543,19 @@ def _select_statement_cll(
     root_scope: sqlglot.optimizer.Scope,
     column_resolver: _ColumnResolver,
     output_table: Optional[_TableName],
+    table_name_schema_mapping: Dict[_TableName, SchemaInfo],
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []
     try:
-        # List output columns.
         output_columns = [
             (select_col.alias_or_name, select_col) for select_col in statement.selects
         ]
         logger.debug("output columns: %s", [col[0] for col in output_columns])
-        for output_col, original_col_expression in output_columns:
+        for output_col, _original_col_expression in output_columns:
             if not output_col or output_col == "*":
                 # If schema information is available, the * will be expanded to the actual columns.
                 # Otherwise, we can't process it.
@@ -512,15 +579,14 @@ def _select_statement_cll(
                 trim_selects=False,
                 # We don't need to pass the schema in here, since we've already qualified the columns.
             )
-            # import pathlib
-            # pathlib.Path("sqlglot.html").write_text(
-            #     str(lineage_node.to_html(dialect=dialect))
-            # )
             # Generate SELECT lineage.
-            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(lineage_node)
-            # column_logic = lineage_node.source
+            direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+                lineage_node,
+                dialect,
+                default_db,
+                default_schema,
+            )
             # Fuzzy resolve the output column.
             original_col_expression = lineage_node.expression
@@ -539,7 +605,7 @@ def _select_statement_cll(
             if original_col_expression.type:
                 output_col_type = original_col_expression.type
-            # Fuzzy resolve upstream columns.
+            # Resolve upstream columns - table names should already be qualified from placeholder processing
             direct_resolved_col_upstreams = {
                 _ColumnRef(
                     table=edge.table,
@@ -560,7 +626,7 @@ def _select_statement_cll(
                         column_type=output_col_type,
                     ),
                     upstreams=sorted(direct_resolved_col_upstreams),
-                    # logic=column_logic.sql(pretty=True, dialect=dialect),
+                    logic=_get_column_transformation(lineage_node, dialect),
                 )
             )
@@ -575,6 +641,7 @@ def _select_statement_cll(
 class _ColumnLineageWithDebugInfo(_ParserBaseModel):
     column_lineage: List[_ColumnLineageInfo]
+    joins: Optional[List[_JoinInfo]] = None
     select_statement: Optional[sqlglot.exp.Expression] = None
     # TODO: Add column exceptions here.
@@ -624,6 +691,13 @@ def _column_level_lineage(
             select_statement=select_statement,
         )
+    # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+    if isinstance(select_statement, sqlglot.exp.Values):
+        return _ColumnLineageWithDebugInfo(
+            column_lineage=[],
+            select_statement=select_statement,
+        )
     assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
     try:
         root_scope = sqlglot.optimizer.build_scope(select_statement)
@@ -643,19 +717,35 @@ def _column_level_lineage(
         root_scope=root_scope,
         column_resolver=column_resolver,
         output_table=downstream_table,
+        table_name_schema_mapping=table_name_schema_mapping,
+        default_db=default_db,
+        default_schema=default_schema,
     )
+    joins: Optional[List[_JoinInfo]] = None
+    try:
+        # List join clauses.
+        joins = _list_joins(dialect=dialect, root_scope=root_scope)
+        logger.debug("Joins: %s", joins)
+    except Exception as e:
+        # This is a non-fatal error, so we can continue.
+        logger.debug("Failed to list joins: %s", e)
     return _ColumnLineageWithDebugInfo(
         column_lineage=column_lineage,
+        joins=joins,
         select_statement=select_statement,
     )
 def _get_direct_raw_col_upstreams(
     lineage_node: sqlglot.lineage.Node,
-) -> Set[_ColumnRef]:
-    # Using a set here to deduplicate upstreams.
-    direct_raw_col_upstreams: Set[_ColumnRef] = set()
+    dialect: Optional[sqlglot.Dialect] = None,
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
+) -> OrderedSet[_ColumnRef]:
+    # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
+    direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
     for node in lineage_node.walk():
         if node.downstream:
@@ -682,6 +772,53 @@ def _get_direct_raw_col_upstreams(
             direct_raw_col_upstreams.add(
                 _ColumnRef(table=table_ref, column=normalized_col)
             )
+        elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
+            # Handle placeholder expressions from lateral joins.
+            #
+            # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
+            # expressions instead of regular table references. This is critical for lateral join column lineage.
+            #
+            # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
+            # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
+            # which we need to parse to establish the lineage: output.value <- my_table2.value
+            #
+            # Without this handling, lateral join column lineage would be incomplete/missing.
+            try:
+                parsed = sqlglot.parse_one(node.name, dialect=dialect)
+                if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
+                    table_ref = _TableName.from_sqlglot_table(
+                        sqlglot.parse_one(
+                            parsed.table, into=sqlglot.exp.Table, dialect=dialect
+                        )
+                    )
+                    # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
+                    # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
+                    # We need this runtime qualification to ensure proper lineage resolution.
+                    # Only qualify if this appears to be a real table reference (not a temporary construct)
+                    if (
+                        not (table_ref.database or table_ref.db_schema)
+                        and dialect is not None
+                    ):
+                        table_ref = table_ref.qualified(
+                            dialect=dialect,
+                            default_db=default_db,
+                            default_schema=default_schema,
+                        )
+                    # Extract column name using proper isinstance check
+                    if isinstance(parsed.this, sqlglot.exp.Identifier):
+                        column_name = parsed.this.name
+                    else:
+                        column_name = str(parsed.this)
+                    direct_raw_col_upstreams.add(
+                        _ColumnRef(table=table_ref, column=column_name)
+                    )
+            except Exception as e:
+                logger.debug(
+                    f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
+                    exc_info=True,
+                )
         else:
             # This branch doesn't matter. For example, a count(*) column would go here, and
             # we don't get any column-level lineage for that.
@@ -690,6 +827,268 @@ def _get_direct_raw_col_upstreams(
     return direct_raw_col_upstreams
+def _is_single_column_expression(
+    expression: sqlglot.exp.Expression,
+) -> bool:
+    """Return True if `expression` is a bare column reference, optionally wrapped in an alias."""
+    # Check if the expression is trivial, i.e. it's just a single column.
+    # Things like count(*) or coalesce(col, 0) are not single columns.
+    if isinstance(expression, sqlglot.exp.Alias):
+        # Look through `<expr> AS <name>` at the aliased expression itself.
+        expression = expression.this
+    return isinstance(expression, sqlglot.exp.Column)
+def _get_column_transformation(
+    lineage_node: sqlglot.lineage.Node,
+    dialect: sqlglot.Dialect,
+    parent: Optional[sqlglot.lineage.Node] = None,
+) -> ColumnTransformation:
+    """Build the `ColumnTransformation` (direct-copy flag + SQL logic) for a lineage node.
+    Follows `lineage_node.downstream` for as long as each node has exactly one downstream
+    node and a single-column expression, then reports the expression where the walk stops.
+    Args:
+        lineage_node: The sqlglot lineage node to start from.
+        dialect: Dialect used to render the expression as SQL text.
+        parent: The node visited immediately before `lineage_node`; only set by the recursive call.
+    Returns:
+        A ColumnTransformation whose `column_logic` is the SQL of the selected expression.
+    """
+    if not lineage_node.downstream:
+        # End of the chain: report the parent's expression, i.e. the last expression
+        # that referenced this node.
+        if parent:
+            expression = parent.expression
+            is_copy = _is_single_column_expression(expression)
+        else:
+            # This case should rarely happen.
+            is_copy = True
+            expression = lineage_node.expression
+        return ColumnTransformation(
+            is_direct_copy=is_copy,
+            column_logic=expression.sql(dialect=dialect),
+        )
+    elif len(lineage_node.downstream) > 1 or not _is_single_column_expression(
+        lineage_node.expression
+    ):
+        # The node fans out to several downstream nodes or applies some expression,
+        # so it is not a direct copy; report this node's own expression.
+        return ColumnTransformation(
+            is_direct_copy=False,
+            column_logic=lineage_node.expression.sql(dialect=dialect),
+        )
+    else:
+        # Exactly one downstream node and a plain column: keep following the chain.
+        return _get_column_transformation(
+            lineage_node=lineage_node.downstream[0],
+            dialect=dialect,
+            parent=lineage_node,
+        )
+def _get_join_side_tables(
+    target: sqlglot.exp.Expression,
+    dialect: sqlglot.Dialect,
+    scope: sqlglot.optimizer.Scope,
+) -> OrderedSet[_TableName]:
+    """Resolve the table(s) behind one join operand.
+    Args:
+        target: The join operand (e.g. `from_clause.this` or `join.this`).
+        dialect: Dialect passed through to the lineage resolution.
+        scope: The scope whose `sources` are used to look up the operand's alias/name.
+    Returns:
+        The set of tables the operand resolves to; may be empty.
+    """
+    target_alias_or_name = target.alias_or_name
+    if (source := scope.sources.get(target_alias_or_name)) and isinstance(
+        source, sqlglot.exp.Table
+    ):
+        # The operand maps directly to a table in this scope; no further resolution needed.
+        return OrderedSet([_TableName.from_sqlglot_table(source)])
+    # Otherwise (e.g. the source is a Scope), we need to do some resolution work:
+    # build a synthetic `<alias>.*` column and collect the tables of its raw upstream columns.
+    column = sqlglot.exp.Column(
+        this=sqlglot.exp.Star(),
+        table=sqlglot.exp.Identifier(this=target.alias_or_name),
+    )
+    columns_used = _get_raw_col_upstreams_for_expression(
+        select=column,
+        dialect=dialect,
+        scope=scope,
+    )
+    return OrderedSet(col.table for col in columns_used)
+def _get_raw_col_upstreams_for_expression(
+    select: sqlglot.exp.Expression,
+    dialect: sqlglot.Dialect,
+    scope: sqlglot.optimizer.Scope,
+) -> OrderedSet[_ColumnRef]:
+    """Find the raw upstream columns of an arbitrary expression evaluated within `scope`.
+    Temporarily replaces `scope.expression` with a copy whose select list is just `select`,
+    runs sqlglot lineage on that single output column, and then restores the original
+    expression (even if lineage resolution raises).
+    Returns:
+        The upstream column references, or an empty set if the scope is not a query.
+    """
+    if not isinstance(scope.expression, sqlglot.exp.Query):
+        # Note that Select, Subquery, SetOperation, etc. are all subclasses of Query.
+        # So this line should basically never happen.
+        return OrderedSet()
+    original_expression = scope.expression
+    # append=False replaces the existing projections; copy=True leaves the original untouched.
+    updated_expression = scope.expression.select(select, append=False, copy=True)
+    try:
+        scope.expression = updated_expression
+        # `select` is the only projection of the rewritten query, hence column index 0.
+        node = sqlglot.lineage.to_node(
+            column=0,
+            scope=scope,
+            dialect=dialect,
+            trim_selects=False,
+        )
+        # No default db/schema is supplied here (both passed as None).
+        return _get_direct_raw_col_upstreams(node, dialect, None, None)
+    finally:
+        # Always undo the temporary mutation of the shared scope object.
+        scope.expression = original_expression
+def _list_joins(
+    dialect: sqlglot.Dialect,
+    root_scope: sqlglot.optimizer.Scope,
+) -> List[_JoinInfo]:
+    """Collect the joins found in every scope reachable from `root_scope`.
+    For each scope this records (1) every explicit `Join` expression and (2) every
+    `Lateral` expression. Joins whose tables cannot be resolved are skipped with a debug log.
+    Args:
+        dialect: Dialect used for lineage resolution and for rendering SQL text.
+        root_scope: The scope to traverse.
+    Returns:
+        One `_JoinInfo` per recorded join, in traversal order.
+    """
+    # TODO: Add a confidence tracker here.
+    joins: List[_JoinInfo] = []
+    scope: sqlglot.optimizer.Scope
+    for scope in root_scope.traverse():
+        # PART 1: Handle regular explicit JOINs (updated API)
+        join: sqlglot.exp.Join
+        for join in scope.expression.find_all(sqlglot.exp.Join):
+            # Left side: the tables behind every FROM clause found in this scope.
+            # NOTE(review): this does not depend on `join`, so it is recomputed per join.
+            left_side_tables: OrderedSet[_TableName] = OrderedSet()
+            from_clause: sqlglot.exp.From
+            for from_clause in scope.find_all(sqlglot.exp.From):
+                left_side_tables.update(
+                    _get_join_side_tables(
+                        target=from_clause.this,
+                        dialect=dialect,
+                        scope=scope,
+                    )
+                )
+            # Right side: the tables behind the join's own operand.
+            right_side_tables: OrderedSet[_TableName] = OrderedSet()
+            if join_target := join.this:
+                right_side_tables = _get_join_side_tables(
+                    target=join_target,
+                    dialect=dialect,
+                    scope=scope,
+                )
+            # We don't need to check for `using` here because it's normalized to `on`
+            # by the sqlglot optimizer.
+            on_clause: Optional[sqlglot.exp.Expression] = join.args.get("on")
+            if on_clause:
+                joined_columns = _get_raw_col_upstreams_for_expression(
+                    select=on_clause, dialect=dialect, scope=scope
+                )
+                unique_tables = OrderedSet(col.table for col in joined_columns)
+                if not unique_tables:
+                    logger.debug(
+                        "Skipping join because we couldn't resolve the tables from the join condition: %s",
+                        join.sql(dialect=dialect),
+                    )
+                    continue
+                # When we have an `on` clause, we only want to include tables whose columns are
+                # involved in the join condition. Without this, a statement like this:
+                #   WITH cte_alias AS (select t1.id, t1.user_id, t2.other_col from t1 join t2 on t1.id = t2.id)
+                #   SELECT * FROM users
+                #   JOIN cte_alias ON users.id = cte_alias.user_id
+                # would incorrectly include t2 as part of the left side tables.
+                left_side_tables = OrderedSet(left_side_tables & unique_tables)
+                right_side_tables = OrderedSet(right_side_tables & unique_tables)
+            else:
+                # Some joins (cross join, lateral join, etc.) don't have an ON clause.
+                # In those cases, we have some best-effort logic to at least extract the
+                # tables involved.
+                joined_columns = OrderedSet()
+                if not left_side_tables and not right_side_tables:
+                    logger.debug(
+                        "Skipping join because we couldn't resolve any tables from the join operands: %s",
+                        join.sql(dialect=dialect),
+                    )
+                    continue
+                elif len(left_side_tables | right_side_tables) == 1:
+                    # When we don't have an ON clause, we're more strict about the
+                    # minimum number of tables we need to resolve to avoid false positives.
+                    # On the off chance someone is doing a self-cross-join, we'll miss it.
+                    logger.debug(
+                        "Skipping join because we couldn't resolve enough tables from the join operands: %s",
+                        join.sql(dialect=dialect),
+                    )
+                    continue
+            joins.append(
+                _JoinInfo(
+                    join_type=_get_join_type(join),
+                    left_tables=list(left_side_tables),
+                    right_tables=list(right_side_tables),
+                    on_clause=on_clause.sql(dialect=dialect) if on_clause else None,
+                    # Sorted so the column order does not depend on discovery order.
+                    columns_involved=list(sorted(joined_columns)),
+                )
+            )
+        # Handle LATERAL constructs
+        for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
+            # Get tables from non-lateral FROM clauses
+            qualified_left: OrderedSet[_TableName] = OrderedSet()
+            for from_clause in scope.find_all(sqlglot.exp.From):
+                if not isinstance(from_clause.this, sqlglot.exp.Lateral):
+                    qualified_left.update(
+                        _get_join_side_tables(from_clause.this, dialect, scope)
+                    )
+            # Get tables from lateral subquery
+            qualified_right: OrderedSet[_TableName] = OrderedSet()
+            if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
+                qualified_right.update(
+                    _TableName.from_sqlglot_table(t)
+                    for t in lateral.this.find_all(sqlglot.exp.Table)
+                )
+            # The right side also includes every left-side table.
+            # NOTE(review): presumably because the lateral subquery may reference them - confirm.
+            qualified_right.update(qualified_left)
+            # Since right is a superset of left here, this is only recorded when the
+            # left side resolved to at least one table.
+            if qualified_left and qualified_right:
+                joins.append(
+                    _JoinInfo(
+                        join_type="LATERAL JOIN",
+                        left_tables=list(qualified_left),
+                        right_tables=list(qualified_right),
+                        on_clause=None,
+                        columns_involved=[],
+                    )
+                )
+    return joins
+def _get_join_type(join: sqlglot.exp.Join) -> str:
+    """Returns the type of join as a string.
+    Args:
+        join: A sqlglot Join expression.
+    Returns:
+        Stringified join type e.g. "LEFT JOIN", "RIGHT OUTER JOIN", "LATERAL JOIN",
+        "CROSS APPLY", "STRAIGHT_JOIN", or plain "JOIN" when no modifiers are set.
+    """
+    # This logic was derived from the sqlglot join_sql method.
+    # https://github.com/tobymao/sqlglot/blob/07bf71bae5d2a5c381104a86bb52c06809c21174/sqlglot/generator.py#L2248
+    # Special case for lateral joins
+    if isinstance(join.this, sqlglot.exp.Lateral):
+        # Any non-None `cross_apply` arg on the Lateral is reported as CROSS APPLY.
+        if join.this.args.get("cross_apply") is not None:
+            return "CROSS APPLY"
+        return "LATERAL JOIN"
+    # Special case for STRAIGHT_JOIN (MySQL)
+    if join.args.get("kind") == "STRAIGHT":
+        return "STRAIGHT_JOIN"
+    # <method> <global> <side> <kind> JOIN
+    #  - method = "HASH", "MERGE"
+    #  - global = "GLOBAL"
+    #  - side = "LEFT", "RIGHT"
+    #  - kind = "INNER", "OUTER", "SEMI", "ANTI"
+    # Each modifier that is present is appended in the order above, then "JOIN".
+    components = []
+    if method := join.args.get("method"):
+        components.append(method)
+    if join.args.get("global"):
+        components.append("GLOBAL")
+    if side := join.args.get("side"):
+        # For SEMI/ANTI joins, side is optional
+        components.append(side)
+    if kind := join.args.get("kind"):
+        components.append(kind)
+    components.append("JOIN")
+    return " ".join(components)
 def _extract_select_from_create(
     statement: sqlglot.exp.Create,
 ) -> sqlglot.exp.Expression:
@@ -784,7 +1183,12 @@ def _try_extract_select(
             statement = sqlglot.exp.Select().select("*").from_(statement)
     elif isinstance(statement, sqlglot.exp.Insert):
         # TODO Need to map column renames in the expressions part of the statement.
-        statement = statement.expression
+        # Preserve CTEs when extracting the SELECT expression from INSERT
+        original_ctes = statement.ctes
+        statement = statement.expression  # Get the SELECT expression from the INSERT
+        if isinstance(statement, sqlglot.exp.Query) and original_ctes:
+            for cte in original_ctes:
+                statement = statement.with_(alias=cte.alias, as_=cte.this)
     elif isinstance(statement, sqlglot.exp.Update):
         # Assumption: the output table is already captured in the modified tables list.
         statement = _extract_select_from_update(statement)
@@ -875,6 +1279,40 @@ def _translate_internal_column_lineage(
     )
+def _translate_internal_joins(
+    table_name_urn_mapping: Dict[_TableName, str],
+    raw_joins: List[_JoinInfo],
+    dialect: sqlglot.Dialect,
+) -> List[JoinInfo]:
+    """Convert internal `_JoinInfo` objects into URN-based `JoinInfo` objects.
+    Args:
+        table_name_urn_mapping: Maps each internal table name to its URN.
+        raw_joins: The joins to translate.
+        dialect: Not used in this function body.
+    Returns:
+        The translated joins. A join is dropped entirely if any table it references
+        (on either side or in its columns) is missing from `table_name_urn_mapping`.
+    """
+    joins = []
+    for raw_join in raw_joins:
+        try:
+            joins.append(
+                JoinInfo(
+                    join_type=raw_join.join_type,
+                    left_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.left_tables
+                    ],
+                    right_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.right_tables
+                    ],
+                    on_clause=raw_join.on_clause,
+                    columns_involved=[
+                        ColumnRef(
+                            table=table_name_urn_mapping[col.table],
+                            column=col.column,
+                        )
+                        for col in raw_join.columns_involved
+                    ],
+                )
+            )
+        except KeyError as e:
+            # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
+            logger.debug(f"Skipping join with unresolvable table: {e}")
+            continue
+    return joins
 _StrOrNone = TypeVar("_StrOrNone", str, Optional[str])
@@ -923,12 +1361,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if not default_dialect:
-        dialect = get_dialect(schema_resolver.platform)
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(default_dialect)
+        dialect = get_dialect(schema_resolver.platform)
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1034,6 +1472,7 @@ def _sqlglot_lineage_inner(
             )
     column_lineage: Optional[List[_ColumnLineageInfo]] = None
+    joins = None
     try:
         with cooperative_timeout(
             timeout=(
@@ -1049,6 +1488,7 @@ def _sqlglot_lineage_inner(
                 default_schema=default_schema,
             )
             column_lineage = column_lineage_debug_info.column_lineage
+            joins = column_lineage_debug_info.joins
     except CooperativeTimeoutError as e:
         logger.debug(f"Timed out while generating column-level lineage: {e}")
         debug_info.column_error = e
@@ -1081,6 +1521,14 @@ def _sqlglot_lineage_inner(
                 f"Failed to translate column lineage to urns: {e}", exc_info=True
             )
             debug_info.column_error = e
+    joins_urns = None
+    if joins is not None:
+        try:
+            joins_urns = _translate_internal_joins(
+                table_name_urn_mapping, raw_joins=joins, dialect=dialect
+            )
+        except KeyError as e:
+            logger.debug(f"Failed to translate joins to urns: {e}", exc_info=True)
     query_type, query_type_props = get_query_type_of_sql(
         original_statement, dialect=dialect
@@ -1095,6 +1543,7 @@ def _sqlglot_lineage_inner(
         in_tables=in_urns,
         out_tables=out_urns,
         column_lineage=column_lineage_urns,
+        joins=joins_urns,
         debug_info=debug_info,
     )
@@ -1104,7 +1553,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.
@@ -1122,8 +1571,8 @@ def _sqlglot_lineage_nocache(
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
-    The SQL dialect can be given as an argument called default_dialect or it can
-    be inferred from the schema_resolver's platform.
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.
@@ -1138,7 +1587,7 @@ def _sqlglot_lineage_nocache(
         schema_resolver: The schema resolver to use for resolving table schemas.
         default_db: The default database to use for unqualified table names.
         default_schema: The default schema to use for unqualified table names.
-        default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
+        override_dialect: Override the dialect provided by 'schema_resolver'.
     Returns:
         A SqlParsingResult object containing the parsed lineage information.
@@ -1163,10 +1612,32 @@ def _sqlglot_lineage_nocache(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
+    except BaseException as e:
+        # Check if this is a PanicException from SQLGlot's Rust tokenizer
+        # We use runtime type checking instead of isinstance() because pyo3_runtime
+        # is only available when sqlglot[rs] is installed and may not be importable
+        # at module load time, but the exception can still be raised at runtime
+        if (
+            e.__class__.__name__ == "PanicException"
+            and e.__class__.__module__ == "pyo3_runtime"
+        ):
+            # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
+            # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
+            # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
+            # Only this specific exception is converted into an error result; every other
+            # BaseException (e.g. KeyboardInterrupt) is re-raised below so that Ctrl+C keeps working.
+            wrapped_exception = Exception(
+                f"pyo3_runtime.PanicException during SQL parsing: {e}"
+            )
+            wrapped_exception.__cause__ = e
+            return SqlParsingResult.make_from_error(wrapped_exception)
+        else:
+            # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
+            raise
 _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
@@ -1179,15 +1650,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
         return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
     else:
         return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
@@ -1239,6 +1710,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
         platform=platform,
@@ -1258,6 +1730,7 @@ def create_lineage_sql_parsed_result(
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)

acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

Potentially problematic release.

acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl