acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub has been flagged as potentially problematic; see the advisory on the package registry for details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Iterable, List, Optional, Union
|
|
4
|
+
from typing import Iterable, List, Optional, Type, Union
|
|
5
5
|
|
|
6
6
|
import yaml
|
|
7
|
-
from pydantic import StrictStr, validator
|
|
7
|
+
from pydantic import Field, StrictStr, validator
|
|
8
8
|
from ruamel.yaml import YAML
|
|
9
9
|
|
|
10
10
|
from datahub.configuration.common import ConfigModel
|
|
@@ -48,7 +48,7 @@ VALID_ENTITY_TYPE_URNS = [
|
|
|
48
48
|
_VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def _validate_entity_type_urn(v: str) -> str:
|
|
51
|
+
def _validate_entity_type_urn(cls: Type, v: str) -> str:
|
|
52
52
|
urn = Urn.make_entity_type_urn(v)
|
|
53
53
|
if urn not in VALID_ENTITY_TYPE_URNS:
|
|
54
54
|
raise ValueError(
|
|
@@ -68,7 +68,7 @@ class TypeQualifierAllowedTypes(ConfigModel):
|
|
|
68
68
|
|
|
69
69
|
class StructuredProperties(ConfigModel):
|
|
70
70
|
id: Optional[str] = None
|
|
71
|
-
urn: Optional[str] = None
|
|
71
|
+
urn: Optional[str] = Field(None, validate_default=True)
|
|
72
72
|
qualified_name: Optional[str] = None
|
|
73
73
|
type: str
|
|
74
74
|
value_entity_types: Optional[List[str]] = None
|
datahub/api/graphql/operation.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Any, Dict, List, Optional
|
|
3
3
|
|
|
4
|
-
from gql import gql
|
|
4
|
+
from gql import GraphQLRequest
|
|
5
5
|
|
|
6
6
|
from datahub.api.graphql.base import BaseApi
|
|
7
7
|
|
|
@@ -79,10 +79,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
|
|
|
79
79
|
if custom_properties is not None:
|
|
80
80
|
variable_values["customProperties"] = custom_properties
|
|
81
81
|
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
request = GraphQLRequest(
|
|
83
|
+
Operation.REPORT_OPERATION_MUTATION, variable_values=variable_values
|
|
84
84
|
)
|
|
85
85
|
|
|
86
|
+
result = self.client.execute(request)
|
|
87
|
+
|
|
86
88
|
return result["reportOperation"]
|
|
87
89
|
|
|
88
90
|
def query_operations(
|
|
@@ -109,12 +111,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
|
|
|
109
111
|
:param partition: The partition to check the operation.
|
|
110
112
|
"""
|
|
111
113
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
+
request = GraphQLRequest(
|
|
115
|
+
Operation.QUERY_OPERATIONS,
|
|
114
116
|
variable_values={
|
|
115
117
|
"urn": urn,
|
|
116
118
|
"startTimeMillis": start_time_millis,
|
|
117
|
-
"
|
|
119
|
+
"endTimeMillis": end_time_millis,
|
|
118
120
|
"limit": limit,
|
|
119
121
|
"filter": self.gen_filter(
|
|
120
122
|
{
|
|
@@ -125,6 +127,8 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
|
|
|
125
127
|
),
|
|
126
128
|
},
|
|
127
129
|
)
|
|
130
|
+
|
|
131
|
+
result = self.client.execute(request)
|
|
128
132
|
if "dataset" in result and "operations" in result["dataset"]:
|
|
129
133
|
operations = []
|
|
130
134
|
if source_type is not None:
|
datahub/cli/check_cli.py
CHANGED
|
@@ -9,6 +9,7 @@ from datetime import datetime
|
|
|
9
9
|
from typing import Any, Dict, List, Optional, Union
|
|
10
10
|
|
|
11
11
|
import click
|
|
12
|
+
from tabulate import tabulate
|
|
12
13
|
|
|
13
14
|
from datahub._version import __package_name__
|
|
14
15
|
from datahub.cli.json_file import check_mce_file
|
|
@@ -21,7 +22,7 @@ from datahub.ingestion.run.pipeline import Pipeline
|
|
|
21
22
|
from datahub.ingestion.sink.sink_registry import sink_registry
|
|
22
23
|
from datahub.ingestion.source.source_registry import source_registry
|
|
23
24
|
from datahub.ingestion.transformer.transform_registry import transform_registry
|
|
24
|
-
from datahub.telemetry import telemetry
|
|
25
|
+
from datahub.upgrade import upgrade
|
|
25
26
|
from datahub.utilities.file_backed_collections import (
|
|
26
27
|
ConnectionWrapper,
|
|
27
28
|
FileBackedDict,
|
|
@@ -47,7 +48,6 @@ def check() -> None:
|
|
|
47
48
|
@click.option(
|
|
48
49
|
"--unpack-mces", default=False, is_flag=True, help="Converts MCEs into MCPs"
|
|
49
50
|
)
|
|
50
|
-
@telemetry.with_telemetry()
|
|
51
51
|
def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
|
|
52
52
|
"""Check the schema of a metadata (MCE or MCP) JSON file."""
|
|
53
53
|
|
|
@@ -105,7 +105,6 @@ def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
|
|
|
105
105
|
default=(),
|
|
106
106
|
help="[Advanced] Paths in the deepdiff object to ignore",
|
|
107
107
|
)
|
|
108
|
-
@telemetry.with_telemetry()
|
|
109
108
|
def metadata_diff(
|
|
110
109
|
actual_file: str, expected_file: str, verbose: bool, ignore_path: List[str]
|
|
111
110
|
) -> None:
|
|
@@ -142,7 +141,6 @@ def metadata_diff(
|
|
|
142
141
|
type=str,
|
|
143
142
|
default=None,
|
|
144
143
|
)
|
|
145
|
-
@telemetry.with_telemetry()
|
|
146
144
|
def plugins(source: Optional[str], verbose: bool) -> None:
|
|
147
145
|
"""List the enabled ingestion plugins."""
|
|
148
146
|
|
|
@@ -234,7 +232,7 @@ def sql_format(sql: str, platform: str) -> None:
|
|
|
234
232
|
default=True,
|
|
235
233
|
help="Run in offline mode and disable schema-aware parsing.",
|
|
236
234
|
)
|
|
237
|
-
@telemetry.with_telemetry()
|
|
235
|
+
@upgrade.check_upgrade
|
|
238
236
|
def sql_lineage(
|
|
239
237
|
sql: Optional[str],
|
|
240
238
|
sql_file: Optional[str],
|
|
@@ -297,7 +295,6 @@ def sql_lineage(
|
|
|
297
295
|
type=str,
|
|
298
296
|
help="the input to validate",
|
|
299
297
|
)
|
|
300
|
-
@telemetry.with_telemetry()
|
|
301
298
|
def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
|
|
302
299
|
"""Test input string against AllowDeny pattern in a DataHub recipe.
|
|
303
300
|
|
|
@@ -346,7 +343,6 @@ def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
|
|
|
346
343
|
type=str,
|
|
347
344
|
help="The input to validate",
|
|
348
345
|
)
|
|
349
|
-
@telemetry.with_telemetry()
|
|
350
346
|
def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
|
|
351
347
|
"""Test input path string against PathSpec patterns in a DataHub recipe.
|
|
352
348
|
|
|
@@ -471,6 +467,7 @@ WHERE
|
|
|
471
467
|
|
|
472
468
|
|
|
473
469
|
@check.command()
|
|
470
|
+
@upgrade.check_upgrade
|
|
474
471
|
def server_config() -> None:
|
|
475
472
|
"""Print the server config."""
|
|
476
473
|
graph = get_default_graph(ClientMode.CLI)
|
|
@@ -478,3 +475,87 @@ def server_config() -> None:
|
|
|
478
475
|
server_config = graph.get_server_config()
|
|
479
476
|
|
|
480
477
|
click.echo(pprint.pformat(server_config))
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@check.command()
|
|
481
|
+
@click.option(
|
|
482
|
+
"--urn", required=False, help="The urn or urn pattern (supports % for wildcard)"
|
|
483
|
+
)
|
|
484
|
+
@click.option("--aspect", default=None, help="Filter to a specific aspect name.")
|
|
485
|
+
@click.option(
|
|
486
|
+
"--start", type=int, default=None, help="Row number of sql store to restore from."
|
|
487
|
+
)
|
|
488
|
+
@click.option("--batch-size", type=int, default=None, help="How many rows to restore.")
|
|
489
|
+
@click.option(
|
|
490
|
+
"--file",
|
|
491
|
+
required=False,
|
|
492
|
+
type=click.Path(exists=True, dir_okay=True, readable=True),
|
|
493
|
+
help="File absolute path containing URNs (one per line) to restore indices",
|
|
494
|
+
)
|
|
495
|
+
@upgrade.check_upgrade
|
|
496
|
+
def restore_indices(
|
|
497
|
+
urn: Optional[str],
|
|
498
|
+
aspect: Optional[str],
|
|
499
|
+
start: Optional[int],
|
|
500
|
+
batch_size: Optional[int],
|
|
501
|
+
file: Optional[str],
|
|
502
|
+
) -> None:
|
|
503
|
+
"""Resync metadata changes into the search and graph indices."""
|
|
504
|
+
if urn is None and file is None:
|
|
505
|
+
raise click.UsageError("Either --urn or --file must be provided")
|
|
506
|
+
graph = get_default_graph(ClientMode.CLI)
|
|
507
|
+
|
|
508
|
+
graph.restore_indices(
|
|
509
|
+
urn_pattern=urn,
|
|
510
|
+
aspect=aspect,
|
|
511
|
+
start=start,
|
|
512
|
+
batch_size=batch_size,
|
|
513
|
+
file=file,
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
@check.command()
|
|
518
|
+
@upgrade.check_upgrade
|
|
519
|
+
def get_kafka_consumer_offsets() -> None:
|
|
520
|
+
"""Get Kafka consumer offsets from the DataHub API."""
|
|
521
|
+
graph = get_default_graph(ClientMode.CLI)
|
|
522
|
+
result = graph.get_kafka_consumer_offsets()
|
|
523
|
+
|
|
524
|
+
table_data = []
|
|
525
|
+
headers = [
|
|
526
|
+
"Topic",
|
|
527
|
+
"Consumer Group",
|
|
528
|
+
"Schema",
|
|
529
|
+
"Partition",
|
|
530
|
+
"Offset",
|
|
531
|
+
"Lag",
|
|
532
|
+
"Avg Lag",
|
|
533
|
+
"Max Lag",
|
|
534
|
+
"Total Lag",
|
|
535
|
+
]
|
|
536
|
+
|
|
537
|
+
for topic, consumers in result.items():
|
|
538
|
+
for consumer_group, schemas in consumers.items():
|
|
539
|
+
for schema, data in schemas.items():
|
|
540
|
+
metrics = data.get("metrics", {})
|
|
541
|
+
partitions = data.get("partitions", {})
|
|
542
|
+
|
|
543
|
+
for partition, partition_data in partitions.items():
|
|
544
|
+
table_data.append(
|
|
545
|
+
[
|
|
546
|
+
topic,
|
|
547
|
+
consumer_group,
|
|
548
|
+
schema,
|
|
549
|
+
partition,
|
|
550
|
+
partition_data.get("offset", "N/A"),
|
|
551
|
+
partition_data.get("lag", "N/A"),
|
|
552
|
+
metrics.get("avgLag", "N/A"),
|
|
553
|
+
metrics.get("maxLag", "N/A"),
|
|
554
|
+
metrics.get("totalLag", "N/A"),
|
|
555
|
+
]
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
if table_data:
|
|
559
|
+
click.echo(tabulate(table_data, headers=headers, tablefmt="grid"))
|
|
560
|
+
else:
|
|
561
|
+
click.echo("No Kafka consumer offset data found.")
|
datahub/cli/cli_utils.py
CHANGED
|
@@ -3,6 +3,7 @@ import logging
|
|
|
3
3
|
import time
|
|
4
4
|
import typing
|
|
5
5
|
from datetime import datetime
|
|
6
|
+
from functools import wraps
|
|
6
7
|
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
|
|
7
8
|
|
|
8
9
|
import click
|
|
@@ -424,3 +425,65 @@ def ensure_has_system_metadata(
|
|
|
424
425
|
props = metadata.properties
|
|
425
426
|
props["clientId"] = datahub_version.__package_name__
|
|
426
427
|
props["clientVersion"] = datahub_version.__version__
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def enable_auto_decorators(main_group: click.Group) -> None:
|
|
431
|
+
"""
|
|
432
|
+
Enable automatic decorators for all click commands.
|
|
433
|
+
This wraps existing command callback functions to add upgrade and telemetry decorators.
|
|
434
|
+
"""
|
|
435
|
+
|
|
436
|
+
def has_decorator(func: Any, module_pattern: str, function_pattern: str) -> bool:
|
|
437
|
+
"""Check if function already has a specific decorator"""
|
|
438
|
+
if hasattr(func, "__wrapped__"):
|
|
439
|
+
current_func = func
|
|
440
|
+
while hasattr(current_func, "__wrapped__"):
|
|
441
|
+
# Check if this wrapper matches the module and function patterns
|
|
442
|
+
if (
|
|
443
|
+
hasattr(current_func, "__module__")
|
|
444
|
+
and module_pattern in current_func.__module__
|
|
445
|
+
and hasattr(current_func, "__name__")
|
|
446
|
+
and function_pattern in current_func.__name__
|
|
447
|
+
):
|
|
448
|
+
return True
|
|
449
|
+
current_func = current_func.__wrapped__
|
|
450
|
+
return False
|
|
451
|
+
|
|
452
|
+
def has_telemetry_decorator(func):
|
|
453
|
+
return has_decorator(func, "telemetry", "with_telemetry")
|
|
454
|
+
|
|
455
|
+
def wrap_command_callback(command_obj):
|
|
456
|
+
"""Wrap a command's callback function to add decorators"""
|
|
457
|
+
if hasattr(command_obj, "callback") and command_obj.callback:
|
|
458
|
+
original_callback = command_obj.callback
|
|
459
|
+
|
|
460
|
+
# Import here to avoid circular imports
|
|
461
|
+
from datahub.telemetry import telemetry
|
|
462
|
+
|
|
463
|
+
decorated_callback = original_callback
|
|
464
|
+
|
|
465
|
+
if not has_telemetry_decorator(decorated_callback):
|
|
466
|
+
log.debug(
|
|
467
|
+
f"Applying telemetry decorator to {original_callback.__module__}.{original_callback.__name__}"
|
|
468
|
+
)
|
|
469
|
+
decorated_callback = telemetry.with_telemetry()(decorated_callback)
|
|
470
|
+
|
|
471
|
+
# Preserve the original function's metadata
|
|
472
|
+
decorated_callback = wraps(original_callback)(decorated_callback)
|
|
473
|
+
|
|
474
|
+
command_obj.callback = decorated_callback
|
|
475
|
+
|
|
476
|
+
def wrap_group_commands(group_obj):
|
|
477
|
+
"""Recursively wrap all commands in a group"""
|
|
478
|
+
if hasattr(group_obj, "commands"):
|
|
479
|
+
for _, command_obj in group_obj.commands.items():
|
|
480
|
+
if isinstance(command_obj, click.Group):
|
|
481
|
+
# Recursively wrap sub-groups
|
|
482
|
+
wrap_group_commands(command_obj)
|
|
483
|
+
else:
|
|
484
|
+
# Wrap individual commands
|
|
485
|
+
wrap_command_callback(command_obj)
|
|
486
|
+
|
|
487
|
+
wrap_group_commands(main_group)
|
|
488
|
+
|
|
489
|
+
log.debug("Auto-decorators enabled successfully")
|
datahub/cli/config_utils.py
CHANGED
|
@@ -11,7 +11,16 @@ import click
|
|
|
11
11
|
import yaml
|
|
12
12
|
from pydantic import BaseModel, ValidationError
|
|
13
13
|
|
|
14
|
-
from datahub.
|
|
14
|
+
from datahub.configuration.env_vars import (
|
|
15
|
+
get_gms_host,
|
|
16
|
+
get_gms_port,
|
|
17
|
+
get_gms_protocol,
|
|
18
|
+
get_gms_token,
|
|
19
|
+
get_gms_url,
|
|
20
|
+
get_skip_config,
|
|
21
|
+
get_system_client_id,
|
|
22
|
+
get_system_client_secret,
|
|
23
|
+
)
|
|
15
24
|
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
16
25
|
|
|
17
26
|
logger = logging.getLogger(__name__)
|
|
@@ -36,15 +45,15 @@ class MissingConfigError(Exception):
|
|
|
36
45
|
|
|
37
46
|
|
|
38
47
|
def get_system_auth() -> Optional[str]:
|
|
39
|
-
system_client_id =
|
|
40
|
-
system_client_secret =
|
|
48
|
+
system_client_id = get_system_client_id()
|
|
49
|
+
system_client_secret = get_system_client_secret()
|
|
41
50
|
if system_client_id is not None and system_client_secret is not None:
|
|
42
51
|
return f"Basic {system_client_id}:{system_client_secret}"
|
|
43
52
|
return None
|
|
44
53
|
|
|
45
54
|
|
|
46
55
|
def _should_skip_config() -> bool:
|
|
47
|
-
return
|
|
56
|
+
return get_skip_config()
|
|
48
57
|
|
|
49
58
|
|
|
50
59
|
def persist_raw_datahub_config(config: dict) -> None:
|
|
@@ -67,11 +76,11 @@ class DatahubConfig(BaseModel):
|
|
|
67
76
|
|
|
68
77
|
|
|
69
78
|
def _get_config_from_env() -> Tuple[Optional[str], Optional[str]]:
|
|
70
|
-
host =
|
|
71
|
-
port =
|
|
72
|
-
token =
|
|
73
|
-
protocol =
|
|
74
|
-
url =
|
|
79
|
+
host = get_gms_host()
|
|
80
|
+
port = get_gms_port()
|
|
81
|
+
token = get_gms_token()
|
|
82
|
+
protocol = get_gms_protocol()
|
|
83
|
+
url = get_gms_url()
|
|
75
84
|
if port is not None:
|
|
76
85
|
url = f"{protocol}://{host}:{port}"
|
|
77
86
|
return url, token
|
|
@@ -108,7 +117,6 @@ def load_client_config() -> DatahubClientConfig:
|
|
|
108
117
|
datahub_config: DatahubClientConfig = DatahubConfig.parse_obj(
|
|
109
118
|
client_config_dict
|
|
110
119
|
).gms
|
|
111
|
-
|
|
112
120
|
return datahub_config
|
|
113
121
|
except ValidationError as e:
|
|
114
122
|
click.echo(f"Error loading your {CONDENSED_DATAHUB_CONFIG_PATH}")
|
datahub/cli/container_cli.py
CHANGED
|
@@ -3,6 +3,7 @@ import logging
|
|
|
3
3
|
import click
|
|
4
4
|
|
|
5
5
|
from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container
|
|
6
|
+
from datahub.upgrade import upgrade
|
|
6
7
|
|
|
7
8
|
logger = logging.getLogger(__name__)
|
|
8
9
|
|
|
@@ -16,6 +17,7 @@ def container() -> None:
|
|
|
16
17
|
@container.command()
|
|
17
18
|
@click.option("--container-urn", required=True, type=str)
|
|
18
19
|
@click.option("--tag-urn", required=True, type=str)
|
|
20
|
+
@upgrade.check_upgrade
|
|
19
21
|
def tag(container_urn: str, tag_urn: str) -> None:
|
|
20
22
|
"""Add patch to add a tag to all datasets in a container"""
|
|
21
23
|
apply_association_to_container(container_urn, tag_urn, "tag")
|
|
@@ -24,6 +26,7 @@ def tag(container_urn: str, tag_urn: str) -> None:
|
|
|
24
26
|
@container.command()
|
|
25
27
|
@click.option("--container-urn", required=True, type=str)
|
|
26
28
|
@click.option("--term-urn", required=True, type=str)
|
|
29
|
+
@upgrade.check_upgrade
|
|
27
30
|
def term(container_urn: str, term_urn: str) -> None:
|
|
28
31
|
"""Add patch to add a term to all datasets in a container"""
|
|
29
32
|
apply_association_to_container(container_urn, term_urn, "term")
|
|
@@ -32,6 +35,7 @@ def term(container_urn: str, term_urn: str) -> None:
|
|
|
32
35
|
@container.command()
|
|
33
36
|
@click.option("--container-urn", required=True, type=str)
|
|
34
37
|
@click.option("--owner-urn", required=True, type=str)
|
|
38
|
+
@upgrade.check_upgrade
|
|
35
39
|
def owner(container_urn: str, owner_urn: str) -> None:
|
|
36
40
|
"""Add patch to add a owner to all datasets in a container"""
|
|
37
41
|
apply_association_to_container(container_urn, owner_urn, "owner")
|
|
@@ -40,6 +44,7 @@ def owner(container_urn: str, owner_urn: str) -> None:
|
|
|
40
44
|
@container.command()
|
|
41
45
|
@click.option("--container-urn", required=True, type=str)
|
|
42
46
|
@click.option("--domain-urn", required=True, type=str)
|
|
47
|
+
@upgrade.check_upgrade
|
|
43
48
|
def domain(container_urn: str, domain_urn: str) -> None:
|
|
44
49
|
"""Add patch to add a domain to all datasets in a container"""
|
|
45
50
|
apply_association_to_container(container_urn, domain_urn, "domain")
|
datahub/cli/delete_cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import random
|
|
3
|
+
import sys
|
|
3
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
5
|
from dataclasses import dataclass
|
|
5
6
|
from datetime import datetime
|
|
@@ -17,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
|
|
|
17
18
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
18
19
|
from datahub.ingestion.graph.config import ClientMode
|
|
19
20
|
from datahub.ingestion.graph.filters import RemovedStatusFilter
|
|
20
|
-
from datahub.telemetry import telemetry
|
|
21
21
|
from datahub.upgrade import upgrade
|
|
22
22
|
from datahub.utilities.perf_timer import PerfTimer
|
|
23
23
|
from datahub.utilities.urns.urn import guess_entity_type
|
|
@@ -115,7 +115,7 @@ class DeletionResult:
|
|
|
115
115
|
help="specifies soft/hard deletion",
|
|
116
116
|
)
|
|
117
117
|
@click.option("-n", "--dry-run", required=False, is_flag=True)
|
|
118
|
-
@
|
|
118
|
+
@upgrade.check_upgrade
|
|
119
119
|
def by_registry(
|
|
120
120
|
registry_id: str,
|
|
121
121
|
soft: bool,
|
|
@@ -170,7 +170,7 @@ def by_registry(
|
|
|
170
170
|
@click.option(
|
|
171
171
|
"-f", "--force", required=False, is_flag=True, help="force the delete if set"
|
|
172
172
|
)
|
|
173
|
-
@
|
|
173
|
+
@upgrade.check_upgrade
|
|
174
174
|
def references(urn: str, dry_run: bool, force: bool) -> None:
|
|
175
175
|
"""
|
|
176
176
|
Delete all references to an entity (but not the entity itself).
|
|
@@ -231,8 +231,9 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
|
|
|
231
231
|
default=3000,
|
|
232
232
|
type=int,
|
|
233
233
|
help="Batch size when querying for entities to un-soft delete."
|
|
234
|
-
"Maximum
|
|
234
|
+
"Maximum 5000. Large batch sizes may cause timeouts.",
|
|
235
235
|
)
|
|
236
|
+
@upgrade.check_upgrade
|
|
236
237
|
def undo_by_filter(
|
|
237
238
|
urn: Optional[str], platform: Optional[str], batch_size: int
|
|
238
239
|
) -> None:
|
|
@@ -317,6 +318,19 @@ def undo_by_filter(
|
|
|
317
318
|
is_flag=True,
|
|
318
319
|
help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
|
|
319
320
|
)
|
|
321
|
+
@click.option(
|
|
322
|
+
"--streaming-batch",
|
|
323
|
+
required=False,
|
|
324
|
+
is_flag=True,
|
|
325
|
+
help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
|
|
326
|
+
)
|
|
327
|
+
@click.option(
|
|
328
|
+
"--streaming-batch-size",
|
|
329
|
+
required=False,
|
|
330
|
+
default=12000,
|
|
331
|
+
type=int,
|
|
332
|
+
help="Batch size for streaming batch deletion for recursive operations.",
|
|
333
|
+
)
|
|
320
334
|
@click.option(
|
|
321
335
|
"--start-time",
|
|
322
336
|
required=False,
|
|
@@ -336,7 +350,7 @@ def undo_by_filter(
|
|
|
336
350
|
default=3000,
|
|
337
351
|
type=int,
|
|
338
352
|
help="Batch size when querying for entities to delete."
|
|
339
|
-
"Maximum
|
|
353
|
+
"Maximum 5000. Large batch sizes may cause timeouts.",
|
|
340
354
|
)
|
|
341
355
|
@click.option(
|
|
342
356
|
"-n",
|
|
@@ -356,7 +370,6 @@ def undo_by_filter(
|
|
|
356
370
|
"--workers", type=int, default=1, help="Num of workers to use for deletion."
|
|
357
371
|
)
|
|
358
372
|
@upgrade.check_upgrade
|
|
359
|
-
@telemetry.with_telemetry()
|
|
360
373
|
def by_filter(
|
|
361
374
|
urn: Optional[str],
|
|
362
375
|
urn_file: Optional[str],
|
|
@@ -368,6 +381,8 @@ def by_filter(
|
|
|
368
381
|
entity_type: Optional[str],
|
|
369
382
|
query: Optional[str],
|
|
370
383
|
recursive: bool,
|
|
384
|
+
streaming_batch: bool,
|
|
385
|
+
streaming_batch_size: int,
|
|
371
386
|
start_time: Optional[datetime],
|
|
372
387
|
end_time: Optional[datetime],
|
|
373
388
|
batch_size: int,
|
|
@@ -386,6 +401,7 @@ def by_filter(
|
|
|
386
401
|
env=env,
|
|
387
402
|
query=query,
|
|
388
403
|
recursive=recursive,
|
|
404
|
+
streaming_batch=streaming_batch,
|
|
389
405
|
)
|
|
390
406
|
soft_delete_filter = _validate_user_soft_delete_flags(
|
|
391
407
|
soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
|
|
@@ -417,26 +433,27 @@ def by_filter(
|
|
|
417
433
|
# Determine which urns to delete.
|
|
418
434
|
delete_by_urn = bool(urn) and not recursive
|
|
419
435
|
if urn:
|
|
420
|
-
urns = [urn]
|
|
421
|
-
|
|
422
436
|
if recursive:
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
437
|
+
_delete_urns_streaming_recursive(
|
|
438
|
+
graph=graph,
|
|
439
|
+
parent_urn=urn,
|
|
440
|
+
aspect_name=aspect,
|
|
441
|
+
soft=soft,
|
|
442
|
+
dry_run=dry_run,
|
|
443
|
+
start_time=start_time,
|
|
444
|
+
end_time=end_time,
|
|
445
|
+
workers=workers,
|
|
446
|
+
soft_delete_filter=soft_delete_filter,
|
|
447
|
+
batch_size=batch_size,
|
|
448
|
+
force=force,
|
|
449
|
+
streaming_batch_size=streaming_batch_size
|
|
450
|
+
if streaming_batch
|
|
451
|
+
else sys.maxsize,
|
|
452
|
+
)
|
|
453
|
+
return
|
|
454
|
+
|
|
455
|
+
else:
|
|
456
|
+
urns = [urn]
|
|
440
457
|
elif urn_file:
|
|
441
458
|
with open(urn_file, "r") as r:
|
|
442
459
|
urns = []
|
|
@@ -452,6 +469,7 @@ def by_filter(
|
|
|
452
469
|
query=query,
|
|
453
470
|
status=soft_delete_filter,
|
|
454
471
|
batch_size=batch_size,
|
|
472
|
+
skip_cache=True,
|
|
455
473
|
)
|
|
456
474
|
)
|
|
457
475
|
if len(urns) == 0:
|
|
@@ -557,6 +575,7 @@ def _validate_user_urn_and_filters(
|
|
|
557
575
|
env: Optional[str],
|
|
558
576
|
query: Optional[str],
|
|
559
577
|
recursive: bool,
|
|
578
|
+
streaming_batch: bool,
|
|
560
579
|
) -> None:
|
|
561
580
|
# Check urn / filters options.
|
|
562
581
|
if urn:
|
|
@@ -592,6 +611,12 @@ def _validate_user_urn_and_filters(
|
|
|
592
611
|
f"This will only delete {urn}. Use --recursive to delete all contained entities."
|
|
593
612
|
)
|
|
594
613
|
|
|
614
|
+
# Check streaming flag.
|
|
615
|
+
if streaming_batch and not recursive:
|
|
616
|
+
raise click.UsageError(
|
|
617
|
+
"The --streaming-batch flag can only be used with --recursive."
|
|
618
|
+
)
|
|
619
|
+
|
|
595
620
|
|
|
596
621
|
def _validate_user_soft_delete_flags(
|
|
597
622
|
soft: bool, aspect: Optional[str], only_soft_deleted: bool
|
|
@@ -654,8 +679,8 @@ def _validate_user_aspect_flags(
|
|
|
654
679
|
def _validate_batch_size(batch_size: int) -> None:
|
|
655
680
|
if batch_size <= 0:
|
|
656
681
|
raise click.UsageError("Batch size must be a positive integer.")
|
|
657
|
-
elif batch_size >
|
|
658
|
-
raise click.UsageError("Batch size cannot exceed
|
|
682
|
+
elif batch_size > 5000:
|
|
683
|
+
raise click.UsageError("Batch size cannot exceed 5,000.")
|
|
659
684
|
|
|
660
685
|
|
|
661
686
|
def _delete_one_urn(
|
|
@@ -738,3 +763,76 @@ def _delete_one_urn(
|
|
|
738
763
|
num_timeseries_records=ts_rows_affected,
|
|
739
764
|
num_referenced_entities=referenced_entities_affected,
|
|
740
765
|
)
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def _delete_urns_streaming_recursive(
|
|
769
|
+
graph: DataHubGraph,
|
|
770
|
+
parent_urn: str,
|
|
771
|
+
aspect_name: Optional[str],
|
|
772
|
+
soft: bool,
|
|
773
|
+
dry_run: bool,
|
|
774
|
+
start_time: Optional[datetime],
|
|
775
|
+
end_time: Optional[datetime],
|
|
776
|
+
workers: int,
|
|
777
|
+
soft_delete_filter: RemovedStatusFilter,
|
|
778
|
+
batch_size: int,
|
|
779
|
+
force: bool,
|
|
780
|
+
streaming_batch_size: int,
|
|
781
|
+
) -> None:
|
|
782
|
+
"""Streaming recursive batch deletion that processes URNs in batches."""
|
|
783
|
+
|
|
784
|
+
entity_type = guess_entity_type(parent_urn)
|
|
785
|
+
click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
|
|
786
|
+
|
|
787
|
+
if not force and not dry_run:
|
|
788
|
+
click.confirm(
|
|
789
|
+
f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
|
|
790
|
+
abort=True,
|
|
791
|
+
)
|
|
792
|
+
|
|
793
|
+
urns = []
|
|
794
|
+
|
|
795
|
+
if entity_type == "dataPlatformInstance":
|
|
796
|
+
child_urns_iter = graph.get_urns_by_filter(
|
|
797
|
+
platform_instance=parent_urn,
|
|
798
|
+
status=soft_delete_filter,
|
|
799
|
+
batch_size=batch_size,
|
|
800
|
+
# Important to skip cache so we can resume from where we left off.
|
|
801
|
+
skip_cache=True,
|
|
802
|
+
)
|
|
803
|
+
else:
|
|
804
|
+
child_urns_iter = graph.get_urns_by_filter(
|
|
805
|
+
container=parent_urn,
|
|
806
|
+
status=soft_delete_filter,
|
|
807
|
+
batch_size=batch_size,
|
|
808
|
+
# Important to skip cache so we can resume from where we left off.
|
|
809
|
+
skip_cache=True,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
for child_urn in child_urns_iter:
|
|
813
|
+
urns.append(child_urn)
|
|
814
|
+
if len(urns) >= streaming_batch_size:
|
|
815
|
+
_delete_urns_parallel(
|
|
816
|
+
graph=graph,
|
|
817
|
+
urns=urns,
|
|
818
|
+
aspect_name=aspect_name,
|
|
819
|
+
soft=soft,
|
|
820
|
+
dry_run=dry_run,
|
|
821
|
+
delete_by_urn=False,
|
|
822
|
+
start_time=start_time,
|
|
823
|
+
end_time=end_time,
|
|
824
|
+
workers=workers,
|
|
825
|
+
)
|
|
826
|
+
urns = []
|
|
827
|
+
urns.append(parent_urn)
|
|
828
|
+
_delete_urns_parallel(
|
|
829
|
+
graph=graph,
|
|
830
|
+
urns=urns,
|
|
831
|
+
aspect_name=aspect_name,
|
|
832
|
+
soft=soft,
|
|
833
|
+
dry_run=dry_run,
|
|
834
|
+
delete_by_urn=False,
|
|
835
|
+
start_time=start_time,
|
|
836
|
+
end_time=end_time,
|
|
837
|
+
workers=workers,
|
|
838
|
+
)
|