acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/report.py
CHANGED
|
@@ -2,18 +2,34 @@ import dataclasses
|
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
4
|
import pprint
|
|
5
|
-
from
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from dataclasses import dataclass, field
|
|
6
7
|
from datetime import datetime, timedelta
|
|
7
8
|
from enum import Enum
|
|
8
|
-
from typing import Any, Optional, runtime_checkable
|
|
9
|
+
from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
|
|
9
10
|
|
|
10
11
|
import humanfriendly
|
|
11
12
|
import pydantic
|
|
12
13
|
from pydantic import BaseModel
|
|
14
|
+
from tabulate import tabulate
|
|
13
15
|
from typing_extensions import Literal, Protocol
|
|
14
16
|
|
|
17
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
18
|
+
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
19
|
+
from datahub.ingestion.api.closeable import Closeable
|
|
15
20
|
from datahub.ingestion.api.report_helpers import format_datetime_relative
|
|
21
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
22
|
+
from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
|
|
23
|
+
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
24
|
+
from datahub.metadata.schema_classes import (
|
|
25
|
+
MetadataChangeProposalClass,
|
|
26
|
+
StatusClass,
|
|
27
|
+
SubTypesClass,
|
|
28
|
+
UpstreamLineageClass,
|
|
29
|
+
)
|
|
30
|
+
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
16
31
|
from datahub.utilities.lossy_collections import LossyList
|
|
32
|
+
from datahub.utilities.urns.urn import guess_platform_name
|
|
17
33
|
|
|
18
34
|
logger = logging.getLogger(__name__)
|
|
19
35
|
LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]
|
|
@@ -26,6 +42,15 @@ class SupportsAsObj(Protocol):
|
|
|
26
42
|
|
|
27
43
|
@dataclass
|
|
28
44
|
class Report(SupportsAsObj):
|
|
45
|
+
def __post_init__(self) -> None:
|
|
46
|
+
self.platform: Optional[str] = None
|
|
47
|
+
|
|
48
|
+
def set_platform(self, platform: str) -> None:
    """Record ``platform`` on this report, replacing any previous value."""
    self.platform = platform
|
|
50
|
+
|
|
51
|
+
def get_platform(self) -> Optional[str]:
    """Return the platform recorded on this report, or ``None`` if unset.

    ``platform`` is only created inside ``__post_init__``. A subclass that
    overrides ``__post_init__`` without chaining to it never gets the
    attribute, so fall back to ``None`` instead of raising
    ``AttributeError``.
    """
    return getattr(self, "platform", None)
|
|
53
|
+
|
|
29
54
|
@staticmethod
|
|
30
55
|
def to_str(some_val: Any) -> str:
|
|
31
56
|
if isinstance(some_val, Enum):
|
|
@@ -82,7 +107,58 @@ class Report(SupportsAsObj):
|
|
|
82
107
|
}
|
|
83
108
|
|
|
84
109
|
def as_string(self) -> str:
    """Render the report as human-readable text.

    The ``aspects_by_subtypes`` entry, when present and non-empty, is taken
    out of the pretty-printed dict and appended afterwards as a table under
    an "Aspects by Subtypes:" heading.
    """
    report_obj = self.as_obj()
    subtype_breakdown = report_obj.pop("aspects_by_subtypes", None)

    # Pretty-print everything except the per-subtype breakdown.
    rendered = pprint.pformat(report_obj, width=150, sort_dicts=False)
    if not subtype_breakdown:
        return rendered

    table = self._format_aspects_by_subtypes_table(subtype_breakdown)
    return "".join([rendered, "\n\nAspects by Subtypes:\n", table])
|
|
122
|
+
|
|
123
|
+
def _format_aspects_by_subtypes_table(
|
|
124
|
+
self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
|
|
125
|
+
) -> str:
|
|
126
|
+
"""Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
|
|
127
|
+
if not aspects_by_subtypes:
|
|
128
|
+
return "No aspects by subtypes data available."
|
|
129
|
+
|
|
130
|
+
all_aspects: set[str] = {
|
|
131
|
+
aspect
|
|
132
|
+
for subtypes in aspects_by_subtypes.values()
|
|
133
|
+
for aspects in subtypes.values()
|
|
134
|
+
for aspect in aspects
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
aspect_rows = sorted(all_aspects)
|
|
138
|
+
|
|
139
|
+
entity_subtype_columns = []
|
|
140
|
+
for entity_type, subtypes in aspects_by_subtypes.items():
|
|
141
|
+
for subtype in subtypes:
|
|
142
|
+
entity_subtype_columns.append(f"{entity_type} ({subtype})")
|
|
143
|
+
|
|
144
|
+
entity_subtype_columns.sort()
|
|
145
|
+
|
|
146
|
+
headers = ["Aspect"] + entity_subtype_columns
|
|
147
|
+
|
|
148
|
+
table_data = [
|
|
149
|
+
[aspect]
|
|
150
|
+
+ [
|
|
151
|
+
aspects.get(aspect, 0)
|
|
152
|
+
for subtypes in aspects_by_subtypes.values()
|
|
153
|
+
for aspects in subtypes.values()
|
|
154
|
+
]
|
|
155
|
+
for aspect in aspect_rows
|
|
156
|
+
]
|
|
157
|
+
|
|
158
|
+
if table_data:
|
|
159
|
+
return tabulate(table_data, headers=headers, tablefmt="grid")
|
|
160
|
+
else:
|
|
161
|
+
return "No aspects by subtypes data available."
|
|
86
162
|
|
|
87
163
|
def as_json(self) -> str:
|
|
88
164
|
return json.dumps(self.as_obj())
|
|
@@ -90,6 +166,15 @@ class Report(SupportsAsObj):
|
|
|
90
166
|
# TODO add helper method for warning / failure status + counts?
|
|
91
167
|
|
|
92
168
|
|
|
169
|
+
@dataclass
class SourceReportSubtypes:
    """Record describing a single URN.

    Holds the URN, its entity type, a subtype label (``"unknown"`` unless
    given), a str -> int mapping named ``aspects`` (empty unless given) and
    a soft-deleted flag (``False`` unless given).
    """

    urn: str
    entity_type: str
    subType: str = "unknown"
    # default_factory gives every instance its own dict.
    aspects: Dict[str, int] = field(default_factory=dict)
    soft_deleted: bool = False
|
|
176
|
+
|
|
177
|
+
|
|
93
178
|
class ReportAttribute(BaseModel):
|
|
94
179
|
severity: LogLevel = "DEBUG"
|
|
95
180
|
help: Optional[str] = None
|
|
@@ -108,6 +193,299 @@ class ReportAttribute(BaseModel):
|
|
|
108
193
|
logger.log(level=self.logger_sev, msg=msg, stacklevel=3)
|
|
109
194
|
|
|
110
195
|
|
|
196
|
+
@dataclass
|
|
197
|
+
class ExamplesReport(Report, Closeable):
|
|
198
|
+
aspects: Dict[str, Dict[str, int]] = field(
|
|
199
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(int))
|
|
200
|
+
)
|
|
201
|
+
# This counts existence of aspects for each entity/subtype
|
|
202
|
+
# This is used by the UI to calculate the percentage of entities with the aspect
|
|
203
|
+
aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
|
|
204
|
+
default_factory=lambda: defaultdict(
|
|
205
|
+
lambda: defaultdict(lambda: defaultdict(int))
|
|
206
|
+
)
|
|
207
|
+
)
|
|
208
|
+
# This counts all aspects for each entity/subtype
|
|
209
|
+
aspects_by_subtypes_full_count: Dict[str, Dict[str, Dict[str, int]]] = field(
|
|
210
|
+
default_factory=lambda: defaultdict(
|
|
211
|
+
lambda: defaultdict(lambda: defaultdict(int))
|
|
212
|
+
)
|
|
213
|
+
)
|
|
214
|
+
samples: Dict[str, Dict[str, List[str]]] = field(
|
|
215
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(list))
|
|
216
|
+
)
|
|
217
|
+
compute_stats_time_seconds: float = 0.0
|
|
218
|
+
_file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
|
|
219
|
+
|
|
220
|
+
# We are adding this to make querying easier for fine-grained lineage
|
|
221
|
+
_fine_grained_lineage_special_case_name = "fineGrainedLineages"
|
|
222
|
+
_samples_to_add: int = 20
|
|
223
|
+
_lineage_aspects_seen: Set[str] = field(default_factory=set)
|
|
224
|
+
|
|
225
|
+
def __post_init__(self) -> None:
    """Run the parent initialisation, then create the ``urn_aspects`` store."""
    super().__post_init__()

    # Extra columns derived from each stored value; the sample-collection
    # queries in this class filter on them by name.
    derived_columns = {
        "urn": lambda entry: entry.urn,
        "entityType": lambda entry: entry.entity_type,
        "subTypes": lambda entry: entry.subType,
        "aspects": lambda entry: json.dumps(entry.aspects),
        "soft_deleted": lambda entry: entry.soft_deleted,
    }
    self._file_based_dict = FileBackedDict(
        tablename="urn_aspects",
        extra_columns=derived_columns,
    )
|
|
237
|
+
|
|
238
|
+
def close(self) -> None:
    """Compute the final statistics, then release the file-backed store.

    The store is closed and its reference cleared even when
    ``compute_stats`` raises, so a failure while computing statistics does
    not leave the store open. The exception still propagates to the caller.
    """
    try:
        self.compute_stats()
    finally:
        if self._file_based_dict is not None:
            self._file_based_dict.close()
            self._file_based_dict = None
|
|
243
|
+
|
|
244
|
+
def _build_aspects_where_clause(self, aspects: List[str]) -> str:
|
|
245
|
+
"""Build WHERE clause for matching any of the given aspects."""
|
|
246
|
+
if not aspects:
|
|
247
|
+
return ""
|
|
248
|
+
|
|
249
|
+
conditions = []
|
|
250
|
+
for aspect in aspects:
|
|
251
|
+
conditions.append(f"aspects LIKE '%{aspect}%'")
|
|
252
|
+
|
|
253
|
+
return " OR ".join(conditions)
|
|
254
|
+
|
|
255
|
+
def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
|
|
256
|
+
"""Helper method to collect samples organized by subtype for a given where clause."""
|
|
257
|
+
|
|
258
|
+
subtype_query = f"""
|
|
259
|
+
SELECT DISTINCT subTypes
|
|
260
|
+
FROM urn_aspects
|
|
261
|
+
WHERE {where_clause}
|
|
262
|
+
"""
|
|
263
|
+
assert self._file_based_dict is not None
|
|
264
|
+
subtypes = set()
|
|
265
|
+
for row in self._file_based_dict.sql_query(subtype_query):
|
|
266
|
+
sub_type = row["subTypes"] or "unknown"
|
|
267
|
+
subtypes.add(sub_type)
|
|
268
|
+
|
|
269
|
+
for sub_type in subtypes:
|
|
270
|
+
query = f"""
|
|
271
|
+
SELECT urn
|
|
272
|
+
FROM urn_aspects
|
|
273
|
+
WHERE {where_clause} AND subTypes = ?
|
|
274
|
+
limit {self._samples_to_add}
|
|
275
|
+
"""
|
|
276
|
+
|
|
277
|
+
for row in self._file_based_dict.sql_query(query, (sub_type,)):
|
|
278
|
+
self.samples[sample_key][sub_type].append(row["urn"])
|
|
279
|
+
|
|
280
|
+
def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
|
|
281
|
+
"""Helper method to collect samples for entities that have any of the given aspects."""
|
|
282
|
+
if not aspects:
|
|
283
|
+
return
|
|
284
|
+
|
|
285
|
+
where_clause = self._build_aspects_where_clause(aspects)
|
|
286
|
+
self._collect_samples_by_subtype(where_clause, sample_key)
|
|
287
|
+
|
|
288
|
+
def _collect_samples_by_lineage_aspects(
|
|
289
|
+
self, aspects: List[str], sample_key: str
|
|
290
|
+
) -> None:
|
|
291
|
+
"""Helper method to collect samples for entities that have any of the given lineage aspects.
|
|
292
|
+
|
|
293
|
+
Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
|
|
294
|
+
"""
|
|
295
|
+
if not aspects:
|
|
296
|
+
return
|
|
297
|
+
|
|
298
|
+
lineage_conditions = []
|
|
299
|
+
for aspect in aspects:
|
|
300
|
+
lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
|
|
301
|
+
|
|
302
|
+
where_clause = " OR ".join(lineage_conditions)
|
|
303
|
+
self._collect_samples_by_subtype(where_clause, sample_key)
|
|
304
|
+
|
|
305
|
+
def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
|
|
306
|
+
"""
|
|
307
|
+
Collect samples for entities that have lineage, profiling, and usage aspects.
|
|
308
|
+
These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
|
|
309
|
+
"""
|
|
310
|
+
if not self._lineage_aspects_seen:
|
|
311
|
+
return
|
|
312
|
+
assert self._file_based_dict is not None
|
|
313
|
+
|
|
314
|
+
# Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
|
|
315
|
+
lineage_conditions = []
|
|
316
|
+
for aspect in self._lineage_aspects_seen:
|
|
317
|
+
lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
|
|
318
|
+
lineage_where_clause = " OR ".join(lineage_conditions)
|
|
319
|
+
|
|
320
|
+
# Build profiling conditions using the same logic as _collect_samples_by_aspects
|
|
321
|
+
profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
|
|
322
|
+
|
|
323
|
+
# Build usage conditions using the same logic as _collect_samples_by_aspects
|
|
324
|
+
usage_where_clause = self._build_aspects_where_clause(
|
|
325
|
+
[
|
|
326
|
+
"datasetUsageStatistics",
|
|
327
|
+
"chartUsageStatistics",
|
|
328
|
+
"dashboardUsageStatistics",
|
|
329
|
+
]
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
query = f"""
|
|
333
|
+
SELECT urn, subTypes
|
|
334
|
+
FROM urn_aspects
|
|
335
|
+
WHERE ({lineage_where_clause})
|
|
336
|
+
AND ({profiling_where_clause})
|
|
337
|
+
AND ({usage_where_clause})
|
|
338
|
+
limit {self._samples_to_add}
|
|
339
|
+
"""
|
|
340
|
+
|
|
341
|
+
for row in self._file_based_dict.sql_query(query):
|
|
342
|
+
sub_type = row["subTypes"] or "unknown"
|
|
343
|
+
self.samples[sample_key][sub_type].append(row["urn"])
|
|
344
|
+
|
|
345
|
+
def _has_fine_grained_lineage(
    self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
) -> bool:
    """Return True when ``mcp`` carries an upstream-lineage aspect with fine-grained lineages.

    Any other aspect type, or an ``UpstreamLineageClass`` aspect whose
    ``fineGrainedLineages`` is empty or ``None``, gives False.
    """
    aspect = mcp.aspect
    if not isinstance(aspect, UpstreamLineageClass):
        return False
    return bool(aspect.fineGrainedLineages)
|
|
353
|
+
|
|
354
|
+
def _update_file_based_dict(
    self,
    urn: str,
    entityType: str,
    aspectName: str,
    mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
) -> None:
    """Record one emitted aspect for ``urn`` in the backing store.

    URNs whose platform (per ``guess_platform_name``) differs from
    ``self.get_platform()`` are ignored. Otherwise:

    * a lineage aspect name (per ``is_lineage_aspect``) is remembered in
      ``self._lineage_aspects_seen``;
    * the per-URN entry is created, or its counter for ``aspectName`` is
      incremented; an upstream-lineage aspect with fine-grained lineages also
      bumps the counter stored under
      ``self._fine_grained_lineage_special_case_name``;
    * a ``SubTypesClass`` aspect sets the entry's subtype to its first type
      name, and a ``StatusClass`` aspect copies ``removed`` into
      ``soft_deleted``.

    Args:
        urn: URN of the entity the aspect belongs to.
        entityType: Entity type stored on a newly created entry.
        aspectName: Name of the aspect being counted.
        mcp: The proposal carrying the aspect payload.
    """
    if guess_platform_name(urn) != self.get_platform():
        return
    if is_lineage_aspect(entityType, aspectName):
        self._lineage_aspects_seen.add(aspectName)
    has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)

    sub_type = "unknown"
    # An empty typeNames list would make typeNames[0] raise IndexError; in that
    # case the subtype simply stays "unknown".
    if isinstance(mcp.aspect, SubTypesClass) and mcp.aspect.typeNames:
        sub_type = mcp.aspect.typeNames[0]

    assert self._file_based_dict is not None
    special_name = self._fine_grained_lineage_special_case_name
    if urn in self._file_based_dict:
        # Never overwrite a known subtype with the "unknown" placeholder.
        if sub_type != "unknown":
            self._file_based_dict[urn].subType = sub_type
        aspects_dict = self._file_based_dict[urn].aspects
        aspects_dict[aspectName] = aspects_dict.get(aspectName, 0) + 1
        if has_fine_grained_lineage:
            aspects_dict[special_name] = aspects_dict.get(special_name, 0) + 1
        # The entry was mutated in place, so the store must be told about it.
        self._file_based_dict.mark_dirty(urn)
    else:
        aspects_dict = {aspectName: 1}
        if has_fine_grained_lineage:
            aspects_dict[special_name] = 1
        self._file_based_dict[urn] = SourceReportSubtypes(
            urn=urn,
            entity_type=entityType,
            subType=sub_type,
            aspects=aspects_dict,
        )

    # isinstance() already implies that mcp and mcp.aspect are not None.
    if isinstance(mcp.aspect, StatusClass):
        self._file_based_dict[urn].soft_deleted = mcp.aspect.removed
        self._file_based_dict.mark_dirty(urn)
|
|
404
|
+
|
|
405
|
+
def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
    """Feed every aspect carried by a work unit into the backing store.

    A ``MetadataChangeEvent`` payload is expanded into proposals with
    ``mcps_from_mce``; any other payload is treated as a single proposal.
    Proposals without an aspect name are skipped; the rest go to
    ``_update_file_based_dict`` under the work unit's URN.
    """
    urn = wu.get_urn()

    if isinstance(wu.metadata, MetadataChangeEvent):
        mcps = list(mcps_from_mce(wu.metadata))
    else:
        mcps = [wu.metadata]

    for mcp in mcps:
        if mcp.aspectName is None:
            continue
        self._update_file_based_dict(urn, mcp.entityType, mcp.aspectName, mcp)
|
|
421
|
+
|
|
422
|
+
def compute_stats(self) -> None:
    """Rebuild the aggregate aspect counts and the sample URNs from the backing store.

    Returns immediately when the store is ``None``. Otherwise rows of
    ``urn_aspects`` that are not soft-deleted are grouped by entity type,
    subtype and aspects payload, and three report fields are rebuilt from
    scratch:

    * ``self.aspects[entity_type][aspect]``: total aspect occurrences.
    * ``self.aspects_by_subtypes[entity_type][sub_type]``: per aspect, the
      number of URNs that have it.
    * ``self.aspects_by_subtypes_full_count[entity_type][sub_type]``: per
      aspect, the total occurrences.

    ``self.samples`` is then cleared and refilled for the ``"profiling"``,
    ``"usage"``, ``"lineage"`` and ``"all_3"`` keys. The elapsed wall-clock time
    is added to ``self.compute_stats_time_seconds``.
    """
    start_time = datetime.now()
    if self._file_based_dict is None:
        return

    query = """
        SELECT entityType, subTypes, aspects, count(*) as count
        FROM urn_aspects
        WHERE soft_deleted = 0
        GROUP BY entityType, subTypes, aspects
    """

    # entity type -> subtype -> aspect -> total occurrences
    entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
        defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    )
    # entity type -> subtype -> aspect -> number of URNs having the aspect
    entity_subtype_aspect_counts_exist: Dict[str, Dict[str, Dict[str, int]]] = (
        defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    )

    for row in self._file_based_dict.sql_query(query):
        entity_type = row["entityType"]
        sub_type = row["subTypes"]
        count = row["count"]
        # The column holds a JSON object (aspect name -> count). The fallback
        # for an empty value must therefore be "{}": "[]" decodes to a list,
        # which has no .items() and would raise AttributeError below.
        aspects = json.loads(row["aspects"] or "{}")
        for aspect, aspect_count in aspects.items():
            # `count` URNs share this exact payload, each with `aspect_count`
            # occurrences of the aspect.
            entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
                aspect_count * count
            )
            entity_subtype_aspect_counts_exist[entity_type][sub_type][aspect] += (
                count
            )

    self.aspects.clear()
    self.aspects_by_subtypes.clear()
    self.aspects_by_subtypes_full_count.clear()
    for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
        for sub_type, aspect_counts in subtype_counts.items():
            for aspect, count in aspect_counts.items():
                self.aspects[entity_type][aspect] += count
            self.aspects_by_subtypes_full_count[entity_type][sub_type] = dict(
                aspect_counts
            )

    for entity_type, subtype_counts in entity_subtype_aspect_counts_exist.items():
        for sub_type, aspect_counts in subtype_counts.items():
            self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)

    self.samples.clear()
    self._collect_samples_by_aspects(["datasetProfile"], "profiling")
    self._collect_samples_by_aspects(
        [
            "datasetUsageStatistics",
            "chartUsageStatistics",
            "dashboardUsageStatistics",
        ],
        "usage",
    )
    self._collect_samples_by_lineage_aspects(
        list(self._lineage_aspects_seen), "lineage"
    )
    self._collect_samples_with_all_conditions("all_3")
    end_time = datetime.now()
    self.compute_stats_time_seconds += (end_time - start_time).total_seconds()
|
|
487
|
+
|
|
488
|
+
|
|
111
489
|
class EntityFilterReport(ReportAttribute):
|
|
112
490
|
type: str
|
|
113
491
|
|
datahub/ingestion/api/sink.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import datetime
|
|
2
|
+
import logging
|
|
2
3
|
from abc import ABCMeta, abstractmethod
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
|
-
from typing import Any, Generic, Optional, Type, TypeVar, cast
|
|
5
|
+
from typing import Any, Callable, Generic, List, Optional, Type, TypeVar, cast
|
|
5
6
|
|
|
6
7
|
from typing_extensions import Self
|
|
7
8
|
|
|
@@ -12,6 +13,8 @@ from datahub.ingestion.api.report import Report
|
|
|
12
13
|
from datahub.utilities.lossy_collections import LossyList
|
|
13
14
|
from datahub.utilities.type_annotations import get_class_from_annotation
|
|
14
15
|
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
15
18
|
|
|
16
19
|
@dataclass
|
|
17
20
|
class SinkReport(Report):
|
|
@@ -89,6 +92,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
|
|
|
89
92
|
ctx: PipelineContext
|
|
90
93
|
config: SinkConfig
|
|
91
94
|
report: SinkReportType
|
|
95
|
+
_pre_shutdown_callbacks: List[Callable[[], None]]
|
|
92
96
|
|
|
93
97
|
@classmethod
|
|
94
98
|
def get_config_class(cls) -> Type[SinkConfig]:
|
|
@@ -106,6 +110,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
|
|
|
106
110
|
self.ctx = ctx
|
|
107
111
|
self.config = config
|
|
108
112
|
self.report = self.get_report_class()()
|
|
113
|
+
self._pre_shutdown_callbacks = []
|
|
109
114
|
|
|
110
115
|
self.__post_init__()
|
|
111
116
|
|
|
@@ -144,8 +149,28 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
|
|
|
144
149
|
def get_report(self) -> SinkReportType:
|
|
145
150
|
return self.report
|
|
146
151
|
|
|
152
|
+
def register_pre_shutdown_callback(self, callback: Callable[[], None]) -> None:
    """Queue ``callback`` to run when the sink is closed.

    Callbacks are appended to ``self._pre_shutdown_callbacks`` in registration
    order. Use this for work that still needs the sink, such as sending a
    final report, before the sink's resources are released.
    """
    pending = self._pre_shutdown_callbacks
    pending.append(callback)
|
|
159
|
+
|
|
147
160
|
def close(self) -> None:
    """Run the registered pre-shutdown callbacks.

    Callbacks run in registration order. One that raises is logged as a
    warning (with traceback) and does not stop the remaining callbacks.
    Subclasses that override this method for sink-specific cleanup should
    still call ``super().close()`` so that the callbacks are executed.
    """
    for hook in self._pre_shutdown_callbacks:
        try:
            hook()
        except Exception as exc:
            # Best effort: a broken callback must not block the shutdown.
            logger.warning(f"Pre-shutdown callback failed: {exc}", exc_info=True)
|
|
149
174
|
|
|
150
175
|
def configured(self) -> str:
|
|
151
176
|
"""Override this method to output a human-readable and scrubbed version of the configured sink"""
|