acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import random
|
|
3
|
+
import sys
|
|
3
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
5
|
from dataclasses import dataclass
|
|
5
6
|
from datetime import datetime
|
|
@@ -15,8 +16,8 @@ from datahub.cli import cli_utils
|
|
|
15
16
|
from datahub.configuration.datetimes import ClickDatetime
|
|
16
17
|
from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
|
|
17
18
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
19
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
18
20
|
from datahub.ingestion.graph.filters import RemovedStatusFilter
|
|
19
|
-
from datahub.telemetry import telemetry
|
|
20
21
|
from datahub.upgrade import upgrade
|
|
21
22
|
from datahub.utilities.perf_timer import PerfTimer
|
|
22
23
|
from datahub.utilities.urns.urn import guess_entity_type
|
|
@@ -48,7 +49,7 @@ def delete() -> None:
|
|
|
48
49
|
|
|
49
50
|
See `datahub delete by-filter` for the list of available filters.
|
|
50
51
|
|
|
51
|
-
See https://
|
|
52
|
+
See https://docs.datahub.com/docs/how/delete-metadata for more detailed docs.
|
|
52
53
|
"""
|
|
53
54
|
pass
|
|
54
55
|
|
|
@@ -114,7 +115,7 @@ class DeletionResult:
|
|
|
114
115
|
help="specifies soft/hard deletion",
|
|
115
116
|
)
|
|
116
117
|
@click.option("-n", "--dry-run", required=False, is_flag=True)
|
|
117
|
-
@telemetry.with_telemetry()
|
|
118
|
+
@upgrade.check_upgrade
|
|
118
119
|
def by_registry(
|
|
119
120
|
registry_id: str,
|
|
120
121
|
soft: bool,
|
|
@@ -124,7 +125,7 @@ def by_registry(
|
|
|
124
125
|
Delete all metadata written using the given registry id and version pair.
|
|
125
126
|
"""
|
|
126
127
|
|
|
127
|
-
client = get_default_graph()
|
|
128
|
+
client = get_default_graph(ClientMode.CLI)
|
|
128
129
|
|
|
129
130
|
if soft and not dry_run:
|
|
130
131
|
raise click.UsageError(
|
|
@@ -169,13 +170,13 @@ def by_registry(
|
|
|
169
170
|
@click.option(
|
|
170
171
|
"-f", "--force", required=False, is_flag=True, help="force the delete if set"
|
|
171
172
|
)
|
|
172
|
-
@telemetry.with_telemetry()
|
|
173
|
+
@upgrade.check_upgrade
|
|
173
174
|
def references(urn: str, dry_run: bool, force: bool) -> None:
|
|
174
175
|
"""
|
|
175
176
|
Delete all references to an entity (but not the entity itself).
|
|
176
177
|
"""
|
|
177
178
|
|
|
178
|
-
graph = get_default_graph()
|
|
179
|
+
graph = get_default_graph(ClientMode.CLI)
|
|
179
180
|
logger.info(f"Using graph: {graph}")
|
|
180
181
|
|
|
181
182
|
references_count, related_aspects = graph.delete_references_to_urn(
|
|
@@ -230,15 +231,16 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
|
|
|
230
231
|
default=3000,
|
|
231
232
|
type=int,
|
|
232
233
|
help="Batch size when querying for entities to un-soft delete."
|
|
233
|
-
"Maximum
|
|
234
|
+
"Maximum 5000. Large batch sizes may cause timeouts.",
|
|
234
235
|
)
|
|
236
|
+
@upgrade.check_upgrade
|
|
235
237
|
def undo_by_filter(
|
|
236
238
|
urn: Optional[str], platform: Optional[str], batch_size: int
|
|
237
239
|
) -> None:
|
|
238
240
|
"""
|
|
239
241
|
Undo soft deletion by filters
|
|
240
242
|
"""
|
|
241
|
-
graph = get_default_graph()
|
|
243
|
+
graph = get_default_graph(ClientMode.CLI)
|
|
242
244
|
logger.info(f"Using {graph}")
|
|
243
245
|
if urn:
|
|
244
246
|
graph.set_soft_delete_status(urn=urn, delete=False)
|
|
@@ -316,6 +318,19 @@ def undo_by_filter(
|
|
|
316
318
|
is_flag=True,
|
|
317
319
|
help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
|
|
318
320
|
)
|
|
321
|
+
@click.option(
|
|
322
|
+
"--streaming-batch",
|
|
323
|
+
required=False,
|
|
324
|
+
is_flag=True,
|
|
325
|
+
help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
|
|
326
|
+
)
|
|
327
|
+
@click.option(
|
|
328
|
+
"--streaming-batch-size",
|
|
329
|
+
required=False,
|
|
330
|
+
default=12000,
|
|
331
|
+
type=int,
|
|
332
|
+
help="Batch size for streaming batch deletion for recursive operations.",
|
|
333
|
+
)
|
|
319
334
|
@click.option(
|
|
320
335
|
"--start-time",
|
|
321
336
|
required=False,
|
|
@@ -335,7 +350,7 @@ def undo_by_filter(
|
|
|
335
350
|
default=3000,
|
|
336
351
|
type=int,
|
|
337
352
|
help="Batch size when querying for entities to delete."
|
|
338
|
-
"Maximum
|
|
353
|
+
"Maximum 5000. Large batch sizes may cause timeouts.",
|
|
339
354
|
)
|
|
340
355
|
@click.option(
|
|
341
356
|
"-n",
|
|
@@ -355,7 +370,6 @@ def undo_by_filter(
|
|
|
355
370
|
"--workers", type=int, default=1, help="Num of workers to use for deletion."
|
|
356
371
|
)
|
|
357
372
|
@upgrade.check_upgrade
|
|
358
|
-
@telemetry.with_telemetry()
|
|
359
373
|
def by_filter(
|
|
360
374
|
urn: Optional[str],
|
|
361
375
|
urn_file: Optional[str],
|
|
@@ -367,6 +381,8 @@ def by_filter(
|
|
|
367
381
|
entity_type: Optional[str],
|
|
368
382
|
query: Optional[str],
|
|
369
383
|
recursive: bool,
|
|
384
|
+
streaming_batch: bool,
|
|
385
|
+
streaming_batch_size: int,
|
|
370
386
|
start_time: Optional[datetime],
|
|
371
387
|
end_time: Optional[datetime],
|
|
372
388
|
batch_size: int,
|
|
@@ -385,6 +401,7 @@ def by_filter(
|
|
|
385
401
|
env=env,
|
|
386
402
|
query=query,
|
|
387
403
|
recursive=recursive,
|
|
404
|
+
streaming_batch=streaming_batch,
|
|
388
405
|
)
|
|
389
406
|
soft_delete_filter = _validate_user_soft_delete_flags(
|
|
390
407
|
soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
|
|
@@ -395,8 +412,8 @@ def by_filter(
|
|
|
395
412
|
|
|
396
413
|
if not force and not soft and not dry_run:
|
|
397
414
|
message = (
|
|
398
|
-
"Hard deletion will permanently delete data
|
|
399
|
-
"We
|
|
415
|
+
"Hard deletion will permanently delete data and can significantly slow down your instance while being executed. "
|
|
416
|
+
"We strongly recommend using soft deletes instead. "
|
|
400
417
|
"Do you want to continue?"
|
|
401
418
|
)
|
|
402
419
|
if only_soft_deleted:
|
|
@@ -410,32 +427,33 @@ def by_filter(
|
|
|
410
427
|
abort=True,
|
|
411
428
|
)
|
|
412
429
|
|
|
413
|
-
graph = get_default_graph()
|
|
430
|
+
graph = get_default_graph(ClientMode.CLI)
|
|
414
431
|
logger.info(f"Using {graph}")
|
|
415
432
|
|
|
416
433
|
# Determine which urns to delete.
|
|
417
434
|
delete_by_urn = bool(urn) and not recursive
|
|
418
435
|
if urn:
|
|
419
|
-
urns = [urn]
|
|
420
|
-
|
|
421
436
|
if recursive:
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
437
|
+
_delete_urns_streaming_recursive(
|
|
438
|
+
graph=graph,
|
|
439
|
+
parent_urn=urn,
|
|
440
|
+
aspect_name=aspect,
|
|
441
|
+
soft=soft,
|
|
442
|
+
dry_run=dry_run,
|
|
443
|
+
start_time=start_time,
|
|
444
|
+
end_time=end_time,
|
|
445
|
+
workers=workers,
|
|
446
|
+
soft_delete_filter=soft_delete_filter,
|
|
447
|
+
batch_size=batch_size,
|
|
448
|
+
force=force,
|
|
449
|
+
streaming_batch_size=streaming_batch_size
|
|
450
|
+
if streaming_batch
|
|
451
|
+
else sys.maxsize,
|
|
452
|
+
)
|
|
453
|
+
return
|
|
454
|
+
|
|
455
|
+
else:
|
|
456
|
+
urns = [urn]
|
|
439
457
|
elif urn_file:
|
|
440
458
|
with open(urn_file, "r") as r:
|
|
441
459
|
urns = []
|
|
@@ -451,6 +469,7 @@ def by_filter(
|
|
|
451
469
|
query=query,
|
|
452
470
|
status=soft_delete_filter,
|
|
453
471
|
batch_size=batch_size,
|
|
472
|
+
skip_cache=True,
|
|
454
473
|
)
|
|
455
474
|
)
|
|
456
475
|
if len(urns) == 0:
|
|
@@ -556,6 +575,7 @@ def _validate_user_urn_and_filters(
|
|
|
556
575
|
env: Optional[str],
|
|
557
576
|
query: Optional[str],
|
|
558
577
|
recursive: bool,
|
|
578
|
+
streaming_batch: bool,
|
|
559
579
|
) -> None:
|
|
560
580
|
# Check urn / filters options.
|
|
561
581
|
if urn:
|
|
@@ -591,6 +611,12 @@ def _validate_user_urn_and_filters(
|
|
|
591
611
|
f"This will only delete {urn}. Use --recursive to delete all contained entities."
|
|
592
612
|
)
|
|
593
613
|
|
|
614
|
+
# Check streaming flag.
|
|
615
|
+
if streaming_batch and not recursive:
|
|
616
|
+
raise click.UsageError(
|
|
617
|
+
"The --streaming-batch flag can only be used with --recursive."
|
|
618
|
+
)
|
|
619
|
+
|
|
594
620
|
|
|
595
621
|
def _validate_user_soft_delete_flags(
|
|
596
622
|
soft: bool, aspect: Optional[str], only_soft_deleted: bool
|
|
@@ -653,8 +679,8 @@ def _validate_user_aspect_flags(
|
|
|
653
679
|
def _validate_batch_size(batch_size: int) -> None:
|
|
654
680
|
if batch_size <= 0:
|
|
655
681
|
raise click.UsageError("Batch size must be a positive integer.")
|
|
656
|
-
elif batch_size >
|
|
657
|
-
raise click.UsageError("Batch size cannot exceed
|
|
682
|
+
elif batch_size > 5000:
|
|
683
|
+
raise click.UsageError("Batch size cannot exceed 5,000.")
|
|
658
684
|
|
|
659
685
|
|
|
660
686
|
def _delete_one_urn(
|
|
@@ -737,3 +763,76 @@ def _delete_one_urn(
|
|
|
737
763
|
num_timeseries_records=ts_rows_affected,
|
|
738
764
|
num_referenced_entities=referenced_entities_affected,
|
|
739
765
|
)
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def _delete_urns_streaming_recursive(
|
|
769
|
+
graph: DataHubGraph,
|
|
770
|
+
parent_urn: str,
|
|
771
|
+
aspect_name: Optional[str],
|
|
772
|
+
soft: bool,
|
|
773
|
+
dry_run: bool,
|
|
774
|
+
start_time: Optional[datetime],
|
|
775
|
+
end_time: Optional[datetime],
|
|
776
|
+
workers: int,
|
|
777
|
+
soft_delete_filter: RemovedStatusFilter,
|
|
778
|
+
batch_size: int,
|
|
779
|
+
force: bool,
|
|
780
|
+
streaming_batch_size: int,
|
|
781
|
+
) -> None:
|
|
782
|
+
"""Streaming recursive batch deletion that processes URNs in batches."""
|
|
783
|
+
|
|
784
|
+
entity_type = guess_entity_type(parent_urn)
|
|
785
|
+
click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
|
|
786
|
+
|
|
787
|
+
if not force and not dry_run:
|
|
788
|
+
click.confirm(
|
|
789
|
+
f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
|
|
790
|
+
abort=True,
|
|
791
|
+
)
|
|
792
|
+
|
|
793
|
+
urns = []
|
|
794
|
+
|
|
795
|
+
if entity_type == "dataPlatformInstance":
|
|
796
|
+
child_urns_iter = graph.get_urns_by_filter(
|
|
797
|
+
platform_instance=parent_urn,
|
|
798
|
+
status=soft_delete_filter,
|
|
799
|
+
batch_size=batch_size,
|
|
800
|
+
# Important to skip cache so we can resume from where we left off.
|
|
801
|
+
skip_cache=True,
|
|
802
|
+
)
|
|
803
|
+
else:
|
|
804
|
+
child_urns_iter = graph.get_urns_by_filter(
|
|
805
|
+
container=parent_urn,
|
|
806
|
+
status=soft_delete_filter,
|
|
807
|
+
batch_size=batch_size,
|
|
808
|
+
# Important to skip cache so we can resume from where we left off.
|
|
809
|
+
skip_cache=True,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
for child_urn in child_urns_iter:
|
|
813
|
+
urns.append(child_urn)
|
|
814
|
+
if len(urns) >= streaming_batch_size:
|
|
815
|
+
_delete_urns_parallel(
|
|
816
|
+
graph=graph,
|
|
817
|
+
urns=urns,
|
|
818
|
+
aspect_name=aspect_name,
|
|
819
|
+
soft=soft,
|
|
820
|
+
dry_run=dry_run,
|
|
821
|
+
delete_by_urn=False,
|
|
822
|
+
start_time=start_time,
|
|
823
|
+
end_time=end_time,
|
|
824
|
+
workers=workers,
|
|
825
|
+
)
|
|
826
|
+
urns = []
|
|
827
|
+
urns.append(parent_urn)
|
|
828
|
+
_delete_urns_parallel(
|
|
829
|
+
graph=graph,
|
|
830
|
+
urns=urns,
|
|
831
|
+
aspect_name=aspect_name,
|
|
832
|
+
soft=soft,
|
|
833
|
+
dry_run=dry_run,
|
|
834
|
+
delete_by_urn=False,
|
|
835
|
+
start_time=start_time,
|
|
836
|
+
end_time=end_time,
|
|
837
|
+
workers=workers,
|
|
838
|
+
)
|
datahub/cli/docker_check.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import enum
|
|
2
2
|
import os
|
|
3
|
+
import pathlib
|
|
3
4
|
from contextlib import contextmanager
|
|
4
5
|
from dataclasses import dataclass
|
|
5
|
-
from typing import Any, Dict, Iterator, List, Optional
|
|
6
|
+
from typing import Any, Dict, Iterator, List, Optional, Set
|
|
6
7
|
|
|
7
8
|
import docker
|
|
8
9
|
import docker.errors
|
|
@@ -10,11 +11,13 @@ import docker.models.containers
|
|
|
10
11
|
import yaml
|
|
11
12
|
|
|
12
13
|
from datahub.configuration.common import ExceptionWithProps
|
|
14
|
+
from datahub.configuration.env_vars import get_compose_project_name
|
|
13
15
|
|
|
14
16
|
# Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
|
|
15
|
-
MIN_MEMORY_NEEDED = 3
|
|
17
|
+
MIN_MEMORY_NEEDED = 4.3 # GB
|
|
18
|
+
MIN_DISK_SPACE_NEEDED = 13 # GB
|
|
16
19
|
|
|
17
|
-
DOCKER_COMPOSE_PROJECT_NAME =
|
|
20
|
+
DOCKER_COMPOSE_PROJECT_NAME = get_compose_project_name()
|
|
18
21
|
DATAHUB_COMPOSE_PROJECT_FILTER = {
|
|
19
22
|
"label": f"com.docker.compose.project={DOCKER_COMPOSE_PROJECT_NAME}"
|
|
20
23
|
}
|
|
@@ -37,6 +40,10 @@ class DockerLowMemoryError(Exception):
|
|
|
37
40
|
SHOW_STACK_TRACE = False
|
|
38
41
|
|
|
39
42
|
|
|
43
|
+
class DockerLowDiskSpaceError(Exception):
|
|
44
|
+
SHOW_STACK_TRACE = False
|
|
45
|
+
|
|
46
|
+
|
|
40
47
|
class DockerComposeVersionError(Exception):
|
|
41
48
|
SHOW_STACK_TRACE = False
|
|
42
49
|
|
|
@@ -102,6 +109,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
|
|
|
102
109
|
"You can increase the memory allocated to Docker in the Docker settings."
|
|
103
110
|
)
|
|
104
111
|
|
|
112
|
+
result = client.containers.run(
|
|
113
|
+
"alpine:latest",
|
|
114
|
+
"sh -c \"df -B1 -P / | awk 'NR==2{print $2, $4}'\"", # total, available
|
|
115
|
+
remove=True,
|
|
116
|
+
stdout=True,
|
|
117
|
+
stderr=True,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
output = result.decode("utf-8").strip()
|
|
121
|
+
total_bytes, available_bytes = map(int, output.split())
|
|
122
|
+
|
|
123
|
+
available_gb = available_bytes / (1024**3)
|
|
124
|
+
if available_gb < MIN_DISK_SPACE_NEEDED:
|
|
125
|
+
raise DockerLowDiskSpaceError(
|
|
126
|
+
f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
|
|
127
|
+
"You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
|
|
128
|
+
)
|
|
129
|
+
|
|
105
130
|
|
|
106
131
|
class ContainerStatus(enum.Enum):
|
|
107
132
|
OK = "is ok"
|
|
@@ -126,10 +151,24 @@ class DockerContainerStatus:
|
|
|
126
151
|
@dataclass
|
|
127
152
|
class QuickstartStatus:
|
|
128
153
|
containers: List[DockerContainerStatus]
|
|
154
|
+
volumes: Set[str]
|
|
155
|
+
# On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
|
|
156
|
+
# While the check command can work, upgrades or
|
|
157
|
+
running_unsupported_version: bool
|
|
158
|
+
|
|
159
|
+
def __init__(
|
|
160
|
+
self,
|
|
161
|
+
containers: List[DockerContainerStatus],
|
|
162
|
+
volumes: List[str],
|
|
163
|
+
running_unsupported_version: bool = False,
|
|
164
|
+
):
|
|
165
|
+
self.containers = containers
|
|
166
|
+
self.running_unsupported_version = running_unsupported_version
|
|
167
|
+
self.volumes = set(volumes)
|
|
129
168
|
|
|
130
169
|
def errors(self) -> List[str]:
|
|
131
170
|
if not self.containers:
|
|
132
|
-
return ["
|
|
171
|
+
return ["datahub is not running"]
|
|
133
172
|
|
|
134
173
|
return [
|
|
135
174
|
f"{container.name} {container.status.value}"
|
|
@@ -176,6 +215,26 @@ class QuickstartStatus:
|
|
|
176
215
|
},
|
|
177
216
|
)
|
|
178
217
|
|
|
218
|
+
def get_containers(self) -> Set[str]:
|
|
219
|
+
if self.containers:
|
|
220
|
+
return {container.name for container in self.containers}
|
|
221
|
+
else:
|
|
222
|
+
return set()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
|
|
226
|
+
return "zookeeper" in containers
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _get_services_from_compose(compose_file: str) -> Set[str]:
|
|
230
|
+
with open(compose_file) as config_file:
|
|
231
|
+
return yaml.safe_load(config_file).get("services", {}).keys()
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _get_volumes_from_compose(compose_file: str) -> Set[str]:
|
|
235
|
+
with open(compose_file) as config_file:
|
|
236
|
+
return yaml.safe_load(config_file).get("volumes", {}).keys()
|
|
237
|
+
|
|
179
238
|
|
|
180
239
|
def check_docker_quickstart() -> QuickstartStatus:
|
|
181
240
|
container_statuses: List[DockerContainerStatus] = []
|
|
@@ -188,7 +247,7 @@ def check_docker_quickstart() -> QuickstartStatus:
|
|
|
188
247
|
ignore_removed=True,
|
|
189
248
|
)
|
|
190
249
|
if len(containers) == 0:
|
|
191
|
-
return QuickstartStatus([])
|
|
250
|
+
return QuickstartStatus([], [], running_unsupported_version=False)
|
|
192
251
|
|
|
193
252
|
# load the expected containers from the docker-compose file
|
|
194
253
|
config_files = (
|
|
@@ -197,16 +256,17 @@ def check_docker_quickstart() -> QuickstartStatus:
|
|
|
197
256
|
.split(",")
|
|
198
257
|
)
|
|
199
258
|
|
|
200
|
-
# If using profiles, alternative check
|
|
259
|
+
# If using profiles, use the alternative check. TODO: Does this really work? Check mixpanel for usage of this.
|
|
201
260
|
if config_files and "/profiles/" in config_files[0]:
|
|
202
261
|
return check_docker_quickstart_profiles(client)
|
|
203
262
|
|
|
204
263
|
all_containers = set()
|
|
205
264
|
for config_file in config_files:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
265
|
+
all_containers.update(_get_services_from_compose(config_file))
|
|
266
|
+
|
|
267
|
+
all_volumes = set()
|
|
268
|
+
for config_file in config_files:
|
|
269
|
+
all_volumes.update(_get_volumes_from_compose(config_file))
|
|
210
270
|
|
|
211
271
|
existing_containers = set()
|
|
212
272
|
# Check that the containers are running and healthy.
|
|
@@ -240,8 +300,12 @@ def check_docker_quickstart() -> QuickstartStatus:
|
|
|
240
300
|
container_statuses.append(
|
|
241
301
|
DockerContainerStatus(missing, ContainerStatus.MISSING)
|
|
242
302
|
)
|
|
243
|
-
|
|
244
|
-
return QuickstartStatus(
|
|
303
|
+
running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
|
|
304
|
+
return QuickstartStatus(
|
|
305
|
+
containers=container_statuses,
|
|
306
|
+
volumes=list(all_volumes),
|
|
307
|
+
running_unsupported_version=running_unsupported_version,
|
|
308
|
+
)
|
|
245
309
|
|
|
246
310
|
|
|
247
311
|
def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
|
|
@@ -254,7 +318,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
|
|
|
254
318
|
ignore_removed=True,
|
|
255
319
|
)
|
|
256
320
|
if len(containers) == 0:
|
|
257
|
-
return QuickstartStatus([])
|
|
321
|
+
return QuickstartStatus([], [], running_unsupported_version=False)
|
|
258
322
|
|
|
259
323
|
existing_containers = set()
|
|
260
324
|
# Check that the containers are running and healthy.
|
|
@@ -273,4 +337,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
|
|
|
273
337
|
|
|
274
338
|
container_statuses.append(DockerContainerStatus(name, status))
|
|
275
339
|
|
|
276
|
-
|
|
340
|
+
# TODO: Can this be handled with older versions?
|
|
341
|
+
return QuickstartStatus(
|
|
342
|
+
container_statuses, volumes=[], running_unsupported_version=False
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def check_upgrade_supported(
|
|
347
|
+
quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
|
|
348
|
+
) -> bool:
|
|
349
|
+
if (
|
|
350
|
+
quickstart_status.running_unsupported_version
|
|
351
|
+
): # we detected a legacy quickstart service
|
|
352
|
+
return False
|
|
353
|
+
|
|
354
|
+
if not quickstart_status.get_containers(): # no containers are running
|
|
355
|
+
return True
|
|
356
|
+
|
|
357
|
+
compose_services = set()
|
|
358
|
+
compose_volumes = set()
|
|
359
|
+
|
|
360
|
+
for compose_file in quickstart_compose_file:
|
|
361
|
+
compose_services.update(_get_services_from_compose(str(compose_file)))
|
|
362
|
+
compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
|
|
363
|
+
|
|
364
|
+
# if all services and volumes are not the same, the state in the volumes may not be compatible with the new services.
|
|
365
|
+
# We are checking for containers and volumes per the compose file, not necessarily all of them being present
|
|
366
|
+
if (
|
|
367
|
+
compose_services == quickstart_status.get_containers()
|
|
368
|
+
and compose_volumes == quickstart_status.volumes
|
|
369
|
+
):
|
|
370
|
+
return True
|
|
371
|
+
else:
|
|
372
|
+
return False
|