acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,6 @@ import enum
|
|
|
4
4
|
import functools
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
import os
|
|
8
7
|
import pathlib
|
|
9
8
|
import tempfile
|
|
10
9
|
import uuid
|
|
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast
|
|
|
14
13
|
|
|
15
14
|
import datahub.emitter.mce_builder as builder
|
|
16
15
|
import datahub.metadata.schema_classes as models
|
|
16
|
+
from datahub.configuration.env_vars import get_sql_agg_query_log
|
|
17
17
|
from datahub.configuration.time_window_config import get_time_bucket
|
|
18
18
|
from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
|
|
19
19
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
20
|
-
from datahub.emitter.sql_parsing_builder import compute_upstream_fields
|
|
21
20
|
from datahub.ingestion.api.closeable import Closeable
|
|
22
21
|
from datahub.ingestion.api.report import Report
|
|
23
22
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
@@ -30,7 +29,9 @@ from datahub.metadata.urns import (
|
|
|
30
29
|
DatasetUrn,
|
|
31
30
|
QueryUrn,
|
|
32
31
|
SchemaFieldUrn,
|
|
32
|
+
Urn,
|
|
33
33
|
)
|
|
34
|
+
from datahub.sql_parsing.fingerprint_utils import generate_hash
|
|
34
35
|
from datahub.sql_parsing.schema_resolver import (
|
|
35
36
|
SchemaResolver,
|
|
36
37
|
SchemaResolverInterface,
|
|
@@ -47,8 +48,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
|
|
|
47
48
|
sqlglot_lineage,
|
|
48
49
|
)
|
|
49
50
|
from datahub.sql_parsing.sqlglot_utils import (
|
|
51
|
+
DialectOrStr,
|
|
50
52
|
_parse_statement,
|
|
51
|
-
generate_hash,
|
|
52
53
|
get_query_fingerprint,
|
|
53
54
|
try_format_query,
|
|
54
55
|
)
|
|
@@ -57,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
|
|
|
57
58
|
ToolMetaExtractorReport,
|
|
58
59
|
)
|
|
59
60
|
from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
|
|
61
|
+
from datahub.utilities.dedup_list import deduplicate_list
|
|
60
62
|
from datahub.utilities.file_backed_collections import (
|
|
61
63
|
ConnectionWrapper,
|
|
62
64
|
FileBackedDict,
|
|
@@ -81,7 +83,7 @@ class QueryLogSetting(enum.Enum):
|
|
|
81
83
|
_DEFAULT_USER_URN = CorpUserUrn("_ingestion")
|
|
82
84
|
_MISSING_SESSION_ID = "__MISSING_SESSION_ID"
|
|
83
85
|
_DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
|
|
84
|
-
|
|
86
|
+
get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
|
|
85
87
|
]
|
|
86
88
|
MAX_UPSTREAM_TABLES_COUNT = 300
|
|
87
89
|
MAX_FINEGRAINEDLINEAGE_COUNT = 2000
|
|
@@ -107,8 +109,9 @@ class ObservedQuery:
|
|
|
107
109
|
default_schema: Optional[str] = None
|
|
108
110
|
query_hash: Optional[str] = None
|
|
109
111
|
usage_multiplier: int = 1
|
|
112
|
+
override_dialect: Optional[DialectOrStr] = None
|
|
110
113
|
|
|
111
|
-
# Use this to store
|
|
114
|
+
# Use this to store additional key-value information about the query for debugging.
|
|
112
115
|
extra_info: Optional[dict] = None
|
|
113
116
|
|
|
114
117
|
|
|
@@ -139,6 +142,9 @@ class QueryMetadata:
|
|
|
139
142
|
|
|
140
143
|
used_temp_tables: bool = True
|
|
141
144
|
|
|
145
|
+
extra_info: Optional[dict] = None
|
|
146
|
+
origin: Optional[Urn] = None
|
|
147
|
+
|
|
142
148
|
def make_created_audit_stamp(self) -> models.AuditStampClass:
|
|
143
149
|
return models.AuditStampClass(
|
|
144
150
|
time=make_ts_millis(self.latest_timestamp) or 0,
|
|
@@ -152,6 +158,48 @@ class QueryMetadata:
|
|
|
152
158
|
actor=(self.actor or _DEFAULT_USER_URN).urn(),
|
|
153
159
|
)
|
|
154
160
|
|
|
161
|
+
def get_subjects(
|
|
162
|
+
self,
|
|
163
|
+
downstream_urn: Optional[str],
|
|
164
|
+
include_fields: bool,
|
|
165
|
+
) -> List[UrnStr]:
|
|
166
|
+
query_subject_urns = OrderedSet[UrnStr]()
|
|
167
|
+
for upstream in self.upstreams:
|
|
168
|
+
query_subject_urns.add(upstream)
|
|
169
|
+
if include_fields:
|
|
170
|
+
for column in sorted(self.column_usage.get(upstream, [])):
|
|
171
|
+
query_subject_urns.add(
|
|
172
|
+
builder.make_schema_field_urn(upstream, column)
|
|
173
|
+
)
|
|
174
|
+
if downstream_urn:
|
|
175
|
+
query_subject_urns.add(downstream_urn)
|
|
176
|
+
if include_fields:
|
|
177
|
+
for column_lineage in self.column_lineage:
|
|
178
|
+
query_subject_urns.add(
|
|
179
|
+
builder.make_schema_field_urn(
|
|
180
|
+
downstream_urn, column_lineage.downstream.column
|
|
181
|
+
)
|
|
182
|
+
)
|
|
183
|
+
return list(query_subject_urns)
|
|
184
|
+
|
|
185
|
+
def make_query_properties(self) -> models.QueryPropertiesClass:
|
|
186
|
+
return models.QueryPropertiesClass(
|
|
187
|
+
statement=models.QueryStatementClass(
|
|
188
|
+
value=self.formatted_query_string,
|
|
189
|
+
language=models.QueryLanguageClass.SQL,
|
|
190
|
+
),
|
|
191
|
+
source=models.QuerySourceClass.SYSTEM,
|
|
192
|
+
created=self.make_created_audit_stamp(),
|
|
193
|
+
lastModified=self.make_last_modified_audit_stamp(),
|
|
194
|
+
origin=self.origin.urn() if self.origin else None,
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
|
|
199
|
+
return models.QuerySubjectsClass(
|
|
200
|
+
subjects=[models.QuerySubjectClass(entity=urn) for urn in urns]
|
|
201
|
+
)
|
|
202
|
+
|
|
155
203
|
|
|
156
204
|
@dataclasses.dataclass
|
|
157
205
|
class KnownQueryLineageInfo:
|
|
@@ -219,8 +267,9 @@ class PreparsedQuery:
|
|
|
219
267
|
query_type_props: QueryTypeProps = dataclasses.field(
|
|
220
268
|
default_factory=lambda: QueryTypeProps()
|
|
221
269
|
)
|
|
222
|
-
# Use this to store
|
|
270
|
+
# Use this to store additional key-value information about the query for debugging.
|
|
223
271
|
extra_info: Optional[dict] = None
|
|
272
|
+
origin: Optional[Urn] = None
|
|
224
273
|
|
|
225
274
|
|
|
226
275
|
@dataclasses.dataclass
|
|
@@ -584,6 +633,9 @@ class SqlParsingAggregator(Closeable):
|
|
|
584
633
|
TableSwap,
|
|
585
634
|
],
|
|
586
635
|
) -> None:
|
|
636
|
+
"""
|
|
637
|
+
This assumes that queries come in order of increasing timestamps.
|
|
638
|
+
"""
|
|
587
639
|
if isinstance(item, KnownQueryLineageInfo):
|
|
588
640
|
self.add_known_query_lineage(item)
|
|
589
641
|
elif isinstance(item, KnownLineageMapping):
|
|
@@ -786,6 +838,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
786
838
|
session_id=session_id,
|
|
787
839
|
timestamp=observed.timestamp,
|
|
788
840
|
user=observed.user,
|
|
841
|
+
override_dialect=observed.override_dialect,
|
|
789
842
|
)
|
|
790
843
|
if parsed.debug_info.error:
|
|
791
844
|
self.report.observed_query_parse_failures.append(
|
|
@@ -814,7 +867,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
814
867
|
downstream=parsed.out_tables[0] if parsed.out_tables else None,
|
|
815
868
|
column_lineage=parsed.column_lineage,
|
|
816
869
|
# TODO: We need a full list of columns referenced, not just the out tables.
|
|
817
|
-
column_usage=
|
|
870
|
+
column_usage=self._compute_upstream_fields(parsed),
|
|
818
871
|
inferred_schema=infer_output_schema(parsed),
|
|
819
872
|
confidence_score=parsed.debug_info.confidence,
|
|
820
873
|
extra_info=observed.extra_info,
|
|
@@ -903,6 +956,8 @@ class SqlParsingAggregator(Closeable):
|
|
|
903
956
|
column_usage=parsed.column_usage or {},
|
|
904
957
|
confidence_score=parsed.confidence_score,
|
|
905
958
|
used_temp_tables=session_has_temp_tables,
|
|
959
|
+
extra_info=parsed.extra_info,
|
|
960
|
+
origin=parsed.origin,
|
|
906
961
|
)
|
|
907
962
|
)
|
|
908
963
|
|
|
@@ -1101,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1101
1156
|
actor=None,
|
|
1102
1157
|
upstreams=parsed.in_tables,
|
|
1103
1158
|
column_lineage=parsed.column_lineage or [],
|
|
1104
|
-
column_usage=
|
|
1159
|
+
column_usage=self._compute_upstream_fields(parsed),
|
|
1105
1160
|
confidence_score=parsed.debug_info.confidence,
|
|
1106
1161
|
)
|
|
1107
1162
|
)
|
|
@@ -1118,6 +1173,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1118
1173
|
session_id: str = _MISSING_SESSION_ID,
|
|
1119
1174
|
timestamp: Optional[datetime] = None,
|
|
1120
1175
|
user: Optional[Union[CorpUserUrn, CorpGroupUrn]] = None,
|
|
1176
|
+
override_dialect: Optional[DialectOrStr] = None,
|
|
1121
1177
|
) -> SqlParsingResult:
|
|
1122
1178
|
with self.report.sql_parsing_timer:
|
|
1123
1179
|
parsed = sqlglot_lineage(
|
|
@@ -1125,6 +1181,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1125
1181
|
schema_resolver=schema_resolver,
|
|
1126
1182
|
default_db=default_db,
|
|
1127
1183
|
default_schema=default_schema,
|
|
1184
|
+
override_dialect=override_dialect,
|
|
1128
1185
|
)
|
|
1129
1186
|
self.report.num_sql_parsed += 1
|
|
1130
1187
|
|
|
@@ -1283,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
|
|
|
1283
1340
|
upstreams.setdefault(upstream, query.query_id)
|
|
1284
1341
|
|
|
1285
1342
|
for lineage_info in query.column_lineage:
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1343
|
+
if (
|
|
1344
|
+
not lineage_info.downstream.column
|
|
1345
|
+
or not lineage_info.downstream.column.strip()
|
|
1346
|
+
):
|
|
1347
|
+
logger.debug(
|
|
1348
|
+
f"Skipping lineage entry with empty downstream column in query {query.query_id}"
|
|
1290
1349
|
)
|
|
1350
|
+
continue
|
|
1351
|
+
|
|
1352
|
+
for upstream_ref in lineage_info.upstreams:
|
|
1353
|
+
if upstream_ref.column and upstream_ref.column.strip():
|
|
1354
|
+
cll[lineage_info.downstream.column].setdefault(
|
|
1355
|
+
SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
|
|
1356
|
+
query.query_id,
|
|
1357
|
+
)
|
|
1358
|
+
else:
|
|
1359
|
+
logger.debug(
|
|
1360
|
+
f"Skipping empty column reference in lineage for query {query.query_id}"
|
|
1361
|
+
)
|
|
1291
1362
|
|
|
1292
1363
|
# Finally, we can build our lineage edge.
|
|
1293
1364
|
required_queries = OrderedSet[QueryId]()
|
|
@@ -1320,6 +1391,13 @@ class SqlParsingAggregator(Closeable):
|
|
|
1320
1391
|
):
|
|
1321
1392
|
upstream_columns = [x[0] for x in upstream_columns_for_query]
|
|
1322
1393
|
required_queries.add(query_id)
|
|
1394
|
+
query = queries_map[query_id]
|
|
1395
|
+
|
|
1396
|
+
column_logic = None
|
|
1397
|
+
for lineage_info in query.column_lineage:
|
|
1398
|
+
if lineage_info.downstream.column == downstream_column:
|
|
1399
|
+
column_logic = lineage_info.logic
|
|
1400
|
+
break
|
|
1323
1401
|
|
|
1324
1402
|
upstream_aspect.fineGrainedLineages.append(
|
|
1325
1403
|
models.FineGrainedLineageClass(
|
|
@@ -1337,7 +1415,16 @@ class SqlParsingAggregator(Closeable):
|
|
|
1337
1415
|
if self.can_generate_query(query_id)
|
|
1338
1416
|
else None
|
|
1339
1417
|
),
|
|
1340
|
-
confidenceScore=
|
|
1418
|
+
confidenceScore=query.confidence_score,
|
|
1419
|
+
transformOperation=(
|
|
1420
|
+
(
|
|
1421
|
+
f"COPY: {column_logic.column_logic}"
|
|
1422
|
+
if column_logic.is_direct_copy
|
|
1423
|
+
else f"SQL: {column_logic.column_logic}"
|
|
1424
|
+
)
|
|
1425
|
+
if column_logic
|
|
1426
|
+
else None
|
|
1427
|
+
),
|
|
1341
1428
|
)
|
|
1342
1429
|
)
|
|
1343
1430
|
|
|
@@ -1429,47 +1516,21 @@ class SqlParsingAggregator(Closeable):
|
|
|
1429
1516
|
return
|
|
1430
1517
|
|
|
1431
1518
|
# If a query doesn't involve any allowed tables, skip it.
|
|
1432
|
-
if
|
|
1433
|
-
self.is_allowed_table(
|
|
1434
|
-
):
|
|
1519
|
+
if (
|
|
1520
|
+
downstream_urn is None or not self.is_allowed_table(downstream_urn)
|
|
1521
|
+
) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
|
|
1435
1522
|
self.report.num_queries_skipped_due_to_filters += 1
|
|
1436
1523
|
return
|
|
1437
1524
|
|
|
1438
|
-
query_subject_urns = OrderedSet[UrnStr]()
|
|
1439
|
-
for upstream in query.upstreams:
|
|
1440
|
-
query_subject_urns.add(upstream)
|
|
1441
|
-
if self.generate_query_subject_fields:
|
|
1442
|
-
for column in sorted(query.column_usage.get(upstream, [])):
|
|
1443
|
-
query_subject_urns.add(
|
|
1444
|
-
builder.make_schema_field_urn(upstream, column)
|
|
1445
|
-
)
|
|
1446
|
-
if downstream_urn:
|
|
1447
|
-
query_subject_urns.add(downstream_urn)
|
|
1448
|
-
if self.generate_query_subject_fields:
|
|
1449
|
-
for column_lineage in query.column_lineage:
|
|
1450
|
-
query_subject_urns.add(
|
|
1451
|
-
builder.make_schema_field_urn(
|
|
1452
|
-
downstream_urn, column_lineage.downstream.column
|
|
1453
|
-
)
|
|
1454
|
-
)
|
|
1455
|
-
|
|
1456
1525
|
yield from MetadataChangeProposalWrapper.construct_many(
|
|
1457
1526
|
entityUrn=self._query_urn(query_id),
|
|
1458
1527
|
aspects=[
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
created=query.make_created_audit_stamp(),
|
|
1466
|
-
lastModified=query.make_last_modified_audit_stamp(),
|
|
1467
|
-
),
|
|
1468
|
-
models.QuerySubjectsClass(
|
|
1469
|
-
subjects=[
|
|
1470
|
-
models.QuerySubjectClass(entity=urn)
|
|
1471
|
-
for urn in query_subject_urns
|
|
1472
|
-
]
|
|
1528
|
+
query.make_query_properties(),
|
|
1529
|
+
make_query_subjects(
|
|
1530
|
+
query.get_subjects(
|
|
1531
|
+
downstream_urn=downstream_urn,
|
|
1532
|
+
include_fields=self.generate_query_subject_fields,
|
|
1533
|
+
)
|
|
1473
1534
|
),
|
|
1474
1535
|
models.DataPlatformInstanceClass(
|
|
1475
1536
|
platform=self.platform.urn(),
|
|
@@ -1538,27 +1599,33 @@ class SqlParsingAggregator(Closeable):
|
|
|
1538
1599
|
|
|
1539
1600
|
@dataclasses.dataclass
|
|
1540
1601
|
class QueryLineageInfo:
|
|
1541
|
-
upstreams:
|
|
1542
|
-
|
|
1602
|
+
upstreams: OrderedSet[
|
|
1603
|
+
UrnStr
|
|
1604
|
+
] # this is direct upstreams, with *no temp tables*
|
|
1605
|
+
column_lineage: OrderedSet[ColumnLineageInfo]
|
|
1543
1606
|
confidence_score: float
|
|
1544
1607
|
|
|
1545
1608
|
def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
|
|
1546
|
-
self.upstreams
|
|
1547
|
-
self.column_lineage
|
|
1609
|
+
self.upstreams.update(other_query.upstreams)
|
|
1610
|
+
self.column_lineage.update(other_query.column_lineage)
|
|
1548
1611
|
self.confidence_score = min(
|
|
1549
1612
|
self.confidence_score, other_query.confidence_score
|
|
1550
1613
|
)
|
|
1551
1614
|
|
|
1615
|
+
cache: Dict[str, QueryLineageInfo] = {}
|
|
1616
|
+
|
|
1552
1617
|
def _recurse_into_query(
|
|
1553
1618
|
query: QueryMetadata, recursion_path: List[QueryId]
|
|
1554
1619
|
) -> QueryLineageInfo:
|
|
1555
1620
|
if query.query_id in recursion_path:
|
|
1556
1621
|
# This is a cycle, so we just return the query as-is.
|
|
1557
1622
|
return QueryLineageInfo(
|
|
1558
|
-
upstreams=query.upstreams,
|
|
1559
|
-
column_lineage=query.column_lineage,
|
|
1623
|
+
upstreams=OrderedSet(query.upstreams),
|
|
1624
|
+
column_lineage=OrderedSet(query.column_lineage),
|
|
1560
1625
|
confidence_score=query.confidence_score,
|
|
1561
1626
|
)
|
|
1627
|
+
if query.query_id in cache:
|
|
1628
|
+
return cache[query.query_id]
|
|
1562
1629
|
recursion_path = [*recursion_path, query.query_id]
|
|
1563
1630
|
composed_of_queries.add(query.query_id)
|
|
1564
1631
|
|
|
@@ -1573,7 +1640,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1573
1640
|
upstream_query = self._query_map.get(upstream_query_id)
|
|
1574
1641
|
if (
|
|
1575
1642
|
upstream_query
|
|
1576
|
-
and upstream_query.query_id not in
|
|
1643
|
+
and upstream_query.query_id not in recursion_path
|
|
1577
1644
|
):
|
|
1578
1645
|
temp_query_lineage_info = _recurse_into_query(
|
|
1579
1646
|
upstream_query, recursion_path
|
|
@@ -1633,11 +1700,14 @@ class SqlParsingAggregator(Closeable):
|
|
|
1633
1700
|
]
|
|
1634
1701
|
)
|
|
1635
1702
|
|
|
1636
|
-
|
|
1637
|
-
upstreams=
|
|
1638
|
-
column_lineage=new_cll,
|
|
1703
|
+
ret = QueryLineageInfo(
|
|
1704
|
+
upstreams=new_upstreams,
|
|
1705
|
+
column_lineage=OrderedSet(new_cll),
|
|
1639
1706
|
confidence_score=new_confidence_score,
|
|
1640
1707
|
)
|
|
1708
|
+
cache[query.query_id] = ret
|
|
1709
|
+
|
|
1710
|
+
return ret
|
|
1641
1711
|
|
|
1642
1712
|
resolved_lineage_info = _recurse_into_query(base_query, [])
|
|
1643
1713
|
|
|
@@ -1670,20 +1740,30 @@ class SqlParsingAggregator(Closeable):
|
|
|
1670
1740
|
)
|
|
1671
1741
|
|
|
1672
1742
|
merged_query_text = ";\n\n".join(
|
|
1673
|
-
[q.formatted_query_string for q in ordered_queries]
|
|
1743
|
+
deduplicate_list([q.formatted_query_string for q in ordered_queries])
|
|
1674
1744
|
)
|
|
1675
1745
|
|
|
1676
1746
|
resolved_query = dataclasses.replace(
|
|
1677
1747
|
base_query,
|
|
1678
1748
|
query_id=composite_query_id,
|
|
1679
1749
|
formatted_query_string=merged_query_text,
|
|
1680
|
-
upstreams=resolved_lineage_info.upstreams,
|
|
1681
|
-
column_lineage=resolved_lineage_info.column_lineage,
|
|
1750
|
+
upstreams=list(resolved_lineage_info.upstreams),
|
|
1751
|
+
column_lineage=list(resolved_lineage_info.column_lineage),
|
|
1682
1752
|
confidence_score=resolved_lineage_info.confidence_score,
|
|
1683
1753
|
)
|
|
1684
1754
|
|
|
1685
1755
|
return resolved_query
|
|
1686
1756
|
|
|
1757
|
+
@staticmethod
|
|
1758
|
+
def _compute_upstream_fields(
|
|
1759
|
+
result: SqlParsingResult,
|
|
1760
|
+
) -> Dict[UrnStr, Set[UrnStr]]:
|
|
1761
|
+
upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
|
|
1762
|
+
for cl in result.column_lineage or []:
|
|
1763
|
+
for upstream in cl.upstreams:
|
|
1764
|
+
upstream_fields[upstream.table].add(upstream.column)
|
|
1765
|
+
return upstream_fields
|
|
1766
|
+
|
|
1687
1767
|
def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
|
|
1688
1768
|
if not self._usage_aggregator:
|
|
1689
1769
|
return
|
|
@@ -1733,8 +1813,9 @@ class SqlParsingAggregator(Closeable):
|
|
|
1733
1813
|
operationType=operation_type,
|
|
1734
1814
|
lastUpdatedTimestamp=make_ts_millis(query.latest_timestamp),
|
|
1735
1815
|
actor=query.actor.urn() if query.actor else None,
|
|
1736
|
-
|
|
1737
|
-
|
|
1816
|
+
sourceType=models.OperationSourceTypeClass.DATA_PLATFORM,
|
|
1817
|
+
queries=(
|
|
1818
|
+
[self._query_urn(query_id)]
|
|
1738
1819
|
if self.can_generate_query(query_id)
|
|
1739
1820
|
else None
|
|
1740
1821
|
),
|