acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

```diff
--- a/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
+++ b/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError
 
 logger = logging.getLogger(__name__)
 
-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )
 
     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )
 
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0
 
 
 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0
 
@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1
 
+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
                 )
             self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph
 
-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
            dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)
 
-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph
 
         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )
 
-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)
 
     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")
 
-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}
 
@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures
 
-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")
 
-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
            platform=self.config.platform,
            env=self.config.env,
            query=self.config.query,
            status=RemovedStatusFilter.ONLY_SOFT_DELETED,
            batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()
 
-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn
 
             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
```
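The reworked `_get_urns` above replaces the hand-rolled `scrollAcrossEntities` GraphQL pagination with `DataHubGraph.get_urns_by_filter`, and excludes recently created `dataProcessInstance` entities by passing a raw `SearchFilterRule` on the `created` field (epoch milliseconds). Below is a minimal standalone sketch of that call pattern; the server URL, the `DatahubClientConfig` construction, and the 10-day retention window are illustrative assumptions, while the filter fields and `get_urns_by_filter` arguments come from the hunk above.

```python
from datetime import datetime, timezone

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule

# Assumed local GMS endpoint; adjust for your deployment.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Cutoff in epoch milliseconds, mirroring the created_from computation above:
# soft-deleted entities created within the last `retention_days` are left alone.
retention_days = 10  # illustrative value
created_from = int(
    (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
)

# Scroll soft-deleted dataProcessInstance urns older than the cutoff.
for urn in graph.get_urns_by_filter(
    entity_types=["dataProcessInstance"],
    status=RemovedStatusFilter.ONLY_SOFT_DELETED,
    batch_size=100,
    extraFilters=[
        SearchFilterRule(
            field="created",
            condition="LESS_THAN",
            values=[f"{created_from}"],
        ).to_raw()
    ],
):
    print(urn)
```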
datahub/ingestion/source/gcs/gcs_source.py

```diff
--- a/datahub/ingestion/source/gcs/gcs_source.py
+++ b/datahub/ingestion/source/gcs/gcs_source.py
@@ -1,6 +1,5 @@
 import logging
 from typing import Dict, Iterable, List, Optional
-from urllib.parse import unquote
 
 from pydantic import Field, SecretStr, validator
 
@@ -17,8 +16,12 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec, is_gcs_uri
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -34,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+
 
 class HMACKey(ConfigModel):
     hmac_access_id: str = Field(description="Access ID")
@@ -80,7 +85,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -102,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
         s3_config = DataLakeSourceConfig(
             path_specs=s3_path_specs,
             aws_config=AwsConnectionConfig(
-                aws_endpoint_url=
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                aws_access_key_id=self.config.credential.hmac_access_id,
                aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                aws_region="auto",
@@ -110,15 +122,26 @@ class GCSSource(StatefulIngestionSourceBase):
            env=self.config.env,
            max_rows=self.config.max_rows,
            number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
         )
         return s3_config
 
     def create_equivalent_s3_path_specs(self):
         s3_path_specs = []
         for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
             s3_path_specs.append(
                 PathSpec(
-                    include=
+                    include=include.replace("gs://", "s3://"),
                     exclude=(
                         [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                         if path_spec.exclude
@@ -129,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                    table_name=path_spec.table_name,
                    enable_compression=path_spec.enable_compression,
                    sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                 )
             )
 
@@ -136,16 +164,31 @@ class GCSSource(StatefulIngestionSourceBase):
 
     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
+        return self.s3_source_overrides(s3_source)
 
     def s3_source_overrides(self, source: S3Source) -> S3Source:
-
+        """
+        Override S3Source methods with GCS-specific implementations using the adapter pattern.
 
-
-
-
+        This method customizes the S3Source instance to behave like a GCS source by
+        applying the GCS-specific adapter that replaces the necessary functionality.
+
+        Args:
+            source: The S3Source instance to customize
+
+        Returns:
+            The modified S3Source instance with GCS behavior
+        """
+        # Create a GCS adapter with project ID and region from our config
+        adapter = create_object_store_adapter(
+            "gcs",
         )
-
+
+        # Apply all customizations to the source
+        return adapter.apply_customizations(source)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
```
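The `create_equivalent_s3_path_specs` hunk above shows how GCS path specs are rewritten so the wrapped `S3Source` can crawl them: a trailing `**` added by partition autodetection is stripped, and the `gs://` scheme is swapped for `s3://`. The snippet below is a standalone illustration of just that string rewrite; `to_s3_include`, the bucket name, and the path are hypothetical and not part of the package.

```python
def to_s3_include(include: str, allow_double_stars: bool = False) -> str:
    """Standalone illustration of the include rewrite above (not a packaged helper)."""
    # PathSpec appends "/**" to the include when autodetecting partitions; strip it
    # before building the equivalent spec, otherwise the new PathSpec would complain.
    if include.endswith("{table}/**") and not allow_double_stars:
        include = include.removesuffix("**")
    # Map the GCS scheme onto S3 so the wrapped S3Source (pointed at
    # https://storage.googleapis.com via HMAC credentials) can crawl the same paths.
    return include.replace("gs://", "s3://")


print(to_s3_include("gs://my-bucket/events/{table}/**"))
# -> s3://my-bucket/events/{table}/
```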
datahub/ingestion/source/gcs/gcs_utils.py

```diff
--- a/datahub/ingestion/source/gcs/gcs_utils.py
+++ b/datahub/ingestion/source/gcs/gcs_utils.py
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"
 
 
 def is_gcs_uri(uri: str) -> bool:
+    """
+    Check if a URI is a GCS URI (starts with gs://).
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     return uri.startswith(GCS_PREFIX)
 
 
 def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+    """
+    Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     if gcs_uri.startswith(GCS_PREFIX):
         return GCS_PREFIX
     return None
 
 
 def strip_gcs_prefix(gcs_uri: str) -> str:
-
+    """
+    Remove the GCS prefix (gs://) from a GCS URI.
+
+    For more general URI handling, consider using the object_store module.
+
+    Args:
+        gcs_uri: A GCS URI starting with gs://
+
+    Returns:
+        The URI without the gs:// prefix
+
+    Raises:
+        ValueError: If the URI doesn't start with gs://
+    """
     prefix = get_gcs_prefix(gcs_uri)
     if not prefix:
-        raise ValueError(f"Not
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
 
     return gcs_uri[len(GCS_PREFIX) :]
 
 
-def get_gcs_bucket_name(path):
-    if not is_gcs_uri(path):
-        raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
-    return strip_gcs_prefix(path).split("/")[0]
-
-
 def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+    """
+    Get the path relative to the bucket from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])
 
 
 def get_gcs_key_prefix(gcs_uri: str) -> str:
+    """
+    Get the key prefix (first path component after bucket) from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     if not is_gcs_uri(gcs_uri):
-        raise ValueError(f"Not a GCS URI. Must start with
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
     return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
```