acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/glue.py (+517 -244)

@@ -25,6 +25,9 @@ from pydantic import validator
 from pydantic.fields import Field

 from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.emitter import mce_builder

@@ -57,14 +60,21 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
 from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
+from datahub.ingestion.source.aws.platform_resource_repository import (
+    GluePlatformResourceRepository,
+)
 from datahub.ingestion.source.aws.s3_util import (
     is_s3_uri,
     make_s3_urn,
     make_s3_urn_for_lineage,
 )
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResourceId,
+)
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.glue_profiling_config import GlueProfilingConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (

@@ -114,6 +124,7 @@ from datahub.metadata.schema_classes import (
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)


@@ -168,6 +179,12 @@ class GlueSourceConfig(
         default=False,
         description="If an S3 Objects Tags should be created for the Tables ingested by Glue.",
     )
+
+    extract_lakeformation_tags: Optional[bool] = Field(
+        default=False,
+        description="When True, extracts Lake Formation tags directly assigned to Glue tables/databases. Note: Tags inherited from databases or other parent resources are excluded.",
+    )
+
     profiling: GlueProfilingConfig = Field(
         default_factory=GlueProfilingConfig,
         description="Configs to ingest data profiles from glue table",

@@ -176,6 +193,7 @@ class GlueSourceConfig(
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description=""
     )
+
     extract_delta_schema_from_parameters: Optional[bool] = Field(
         default=False,
         description="If enabled, delta schemas can be alternatively fetched from table parameters.",

@@ -199,6 +217,10 @@ class GlueSourceConfig(
     def s3_client(self):
         return self.get_s3_client()

+    @property
+    def lakeformation_client(self):
+        return self.get_lakeformation_client()
+
     @validator("glue_s3_lineage_direction")
     def check_direction(cls, v: str) -> str:
         if v.lower() not in ["upstream", "downstream"]:

@@ -247,12 +269,19 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
 )
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class GlueSource(StatefulIngestionSourceBase):
     """
     Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.

@@ -311,6 +340,8 @@ class GlueSource(StatefulIngestionSourceBase):
     source_config: GlueSourceConfig
     report: GlueSourceReport

+    lf_tag_cache: Dict[str, Dict[str, List[str]]] = {}
+
     def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.ctx = ctx

@@ -320,9 +351,118 @@ class GlueSource(StatefulIngestionSourceBase):
         self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
+        # Initialize Lake Formation client
+        self.lf_client = config.lakeformation_client
         self.extract_transforms = config.extract_transforms
         self.env = config.env

+        self.platform_resource_repository: Optional[
+            "GluePlatformResourceRepository"
+        ] = None
+        if self.ctx.graph:
+            self.platform_resource_repository = GluePlatformResourceRepository(
+                self.ctx.graph,
+                platform_instance=self.source_config.platform_instance,
+                catalog=self.source_config.catalog_id,
+            )
+
+    def get_database_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Database": {
+                        "CatalogId": catalog_id,
+                        "Name": database_name,
+                    }
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            if response:
+                logger.info(f"LF tags for database {database_name}: {response}")
+                # Extract and return the LF tags
+                lf_tags = response.get("LFTagOnDatabase", [])
+
+                tags = []
+                for lf_tag in lf_tags:
+                    catalog_id = lf_tag.get("CatalogId")
+                    tag_key = lf_tag.get("TagKey")
+                    for tag_value in lf_tag.get("TagValues", []):
+                        t = LakeFormationTag(
+                            key=tag_key,
+                            value=tag_value,
+                            catalog=catalog_id,
+                        )
+                        tags.append(t)
+                return tags
+
+        except Exception as e:
+            print(
+                f"Error getting LF tags for table {catalog_id}.{database_name}: {str(e)}"
+            )
+        return []
+
+    def get_table_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+        table_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Table": {
+                        "CatalogId": catalog_id,
+                        "DatabaseName": database_name,
+                        "Name": table_name,
+                    },
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            # Extract and return the LF tags
+            lf_tags = response.get("LFTagsOnTable", [])
+
+            tags = []
+            for lf_tag in lf_tags:
+                catalog_id = lf_tag.get("CatalogId")
+                tag_key = lf_tag.get("TagKey")
+                for tag_value in lf_tag.get("TagValues", []):
+                    t = LakeFormationTag(
+                        key=tag_key,
+                        value=tag_value,
+                        catalog=catalog_id,
+                    )
+                    tags.append(t)
+            return tags
+
+        except Exception:
+            return []
+
+    def get_all_lf_tags(self) -> List:
+        # 1. Get all LF-Tags in your account (metadata only)
+        response = self.lf_client.list_lf_tags(
+            MaxResults=50  # Adjust as needed
+        )
+        all_lf_tags = response["LFTags"]
+        # Continue pagination if necessary
+        while "NextToken" in response:
+            response = self.lf_client.list_lf_tags(
+                NextToken=response["NextToken"], MaxResults=50
+            )
+            all_lf_tags.extend(response["LFTags"])
+        return all_lf_tags
+
     def get_glue_arn(
         self, account_id: str, database: str, table: Optional[str] = None
     ) -> str:

@@ -385,6 +525,14 @@ class GlueSource(StatefulIngestionSourceBase):
         bucket = url.netloc
         key = url.path[1:]

+        # validate that we have a non-empty key
+        if not key:
+            self.report.num_job_script_location_invalid += 1
+            logger.warning(
+                f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path for flow urn: {flow_urn}."
+            )
+            return None
+
         # download the script contents
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
         try:

@@ -396,6 +544,14 @@ class GlueSource(StatefulIngestionSourceBase):
             )
             self.report.num_job_script_failed_download += 1
             return None
+        except botocore.exceptions.ParamValidationError as e:
+            self.report_warning(
+                flow_urn,
+                f"Invalid S3 path for Glue job script {script_path}: {e}",
+            )
+            self.report.num_job_script_location_invalid += 1
+            return None
+
         script = obj["Body"].read().decode("utf-8")

         try:

@@ -869,7 +1025,7 @@ class GlueSource(StatefulIngestionSourceBase):
         table_stats: dict,
         column_stats: dict,
         partition_spec: Optional[str] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         assert self.source_config.profiling

         # instantiate profile class

@@ -936,6 +1092,14 @@ class GlueSource(StatefulIngestionSourceBase):

             dataset_profile.fieldProfiles.append(column_profile)

+        # if no stats are available, skip ingestion
+        if (
+            not dataset_profile.fieldProfiles
+            and dataset_profile.rowCount is None
+            and dataset_profile.columnCount is None
+        ):
+            return None
+
         if partition_spec:
             # inject partition level stats
             dataset_profile.partitionSpec = PartitionSpecClass(

@@ -990,18 +1154,20 @@ class GlueSource(StatefulIngestionSourceBase):
                 if self.source_config.profiling.partition_patterns.allowed(
                     partition_spec
                 ):
-
+                    profile_mcp = self._create_profile_mcp(
                     mce, table_stats, column_stats, partition_spec
-                    )
+                    )
+                    if profile_mcp:
+                        yield profile_mcp.as_workunit()
                 else:
                     continue
             else:
                 # ingest data profile without partition
                 table_stats = response["Table"]["Parameters"]
                 column_stats = response["Table"]["StorageDescriptor"]["Columns"]
- … (3 removed lines not recovered)
+                profile_mcp = self._create_profile_mcp(mce, table_stats, column_stats)
+                if profile_mcp:
+                    yield profile_mcp.as_workunit()

     def gen_database_key(self, database: str) -> DatabaseKey:
         return DatabaseKey(

@@ -1012,9 +1178,67 @@ class GlueSource(StatefulIngestionSourceBase):
             backcompat_env_as_instance=True,
         )

+    def gen_platform_resource(
+        self, tag: LakeFormationTag
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.ctx.graph and self.platform_resource_repository:
+            platform_resource_id = (
+                LakeFormationTagPlatformResourceId.get_or_create_from_tag(
+                    tag=tag,
+                    platform_resource_repository=self.platform_resource_repository,
+                    catalog_id=tag.catalog,
+                )
+            )
+            logger.info(f"Created platform resource {platform_resource_id}")
+
+            lf_tag = self.platform_resource_repository.get_entity_from_datahub(
+                platform_resource_id, False
+            )
+            if (
+                tag.to_datahub_tag_urn().urn()
+                not in lf_tag.datahub_linked_resources().urns
+            ):
+                try:
+                    lf_tag.datahub_linked_resources().add(
+                        tag.to_datahub_tag_urn().urn()
+                    )
+                    platform_resource = lf_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to create platform resource for tag {tag}: {e}",
+                        exc_info=True,
+                    )
+                    self.report.report_warning(
+                        context="Failed to create platform resource",
+                        message=f"Failed to create platform resource for Tag: {tag}",
+                    )
+
     def gen_database_containers(
         self, database: Mapping[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
+        container_tags: Optional[List] = None
+        if self.source_config.extract_lakeformation_tags:
+            try:
+                tags = self.get_database_lf_tags(
+                    catalog_id=database["CatalogId"], database_name=database["Name"]
+                )
+                container_tags = []
+                for tag in tags:
+                    try:
+                        container_tags.append(tag.to_datahub_tag_urn().name)
+                        yield from self.gen_platform_resource(tag)
+                    except InvalidUrnError:
+                        continue
+            except Exception:
+                self.report_warning(
+                    reason="Failed to extract Lake Formation tags for database",
+                    key=database["Name"],
+                )
         domain_urn = self._gen_domain_urn(database["Name"])
         database_container_key = self.gen_database_key(database["Name"])
         parameters = database.get("Parameters", {})

@@ -1032,6 +1256,7 @@ class GlueSource(StatefulIngestionSourceBase):
             qualified_name=self.get_glue_arn(
                 account_id=database["CatalogId"], database=database["Name"]
             ),
+            tags=container_tags,
             extra_properties=parameters,
         )


@@ -1106,9 +1331,8 @@ class GlueSource(StatefulIngestionSourceBase):
             platform_instance=self.source_config.platform_instance,
         )

- … (3 removed lines not recovered)
+        yield from self._extract_record(dataset_urn, table, full_table_name)
+        # generate a Dataset snapshot
         # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
         # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
         yield MetadataChangeProposalWrapper(

@@ -1124,19 +1348,6 @@ class GlueSource(StatefulIngestionSourceBase):
             dataset_urn=dataset_urn, db_name=database_name
         )

-        wu = self.get_lineage_if_enabled(mce)
-        if wu:
-            yield wu
-
-        try:
-            yield from self.get_profile_if_enabled(mce, database_name, table_name)
-        except KeyError as e:
-            self.report.report_failure(
-                message="Failed to extract profile for table",
-                context=f"Table: {dataset_urn}",
-                exc=e,
-            )
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}

@@ -1191,159 +1402,201 @@ class GlueSource(StatefulIngestionSourceBase):
         for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
             yield MetadataWorkUnit(id=dataset_id, mce=dataset_mce)

-    # flake8: noqa: C901
     def _extract_record(
         self, dataset_urn: str, table: Dict, table_name: str
-    ) ->
+    ) -> Iterable[MetadataWorkUnit]:
+        """Extract and yield metadata work units for a Glue table."""
         logger.debug(
             f"extract record from table={table_name} for dataset={dataset_urn}"
         )

- … (13 removed lines not recovered)
-                name=table["Name"],
-                qualifiedName=self.get_glue_arn(
-                    account_id=table["CatalogId"],
-                    database=table["DatabaseName"],
-                    table=table["Name"],
-                ),
-            )
+        # Create the main dataset snapshot
+        dataset_snapshot = DatasetSnapshot(
+            urn=dataset_urn,
+            aspects=[
+                Status(removed=False),
+                self._get_dataset_properties(table),
+            ],
+        )
+
+        # Add schema metadata if available
+        schema_metadata = self._get_schema_metadata(table, table_name, dataset_urn)
+        if schema_metadata:
+            dataset_snapshot.aspects.append(schema_metadata)

- … (7 removed lines not recovered)
+        # Add platform instance
+        dataset_snapshot.aspects.append(self._get_data_platform_instance())
+
+        # Add ownership if enabled
+        if self.extract_owners:
+            ownership = GlueSource._get_ownership(table.get("Owner"))
+            if ownership:
+                dataset_snapshot.aspects.append(ownership)
+
+        # Add S3 tags if enabled
+        s3_tags = self._get_s3_tags(table, dataset_urn)
+        if s3_tags:
+            dataset_snapshot.aspects.append(s3_tags)
+
+        # Add Lake Formation tags if enabled
+        if self.source_config.extract_lakeformation_tags:
+            tags = self.get_table_lf_tags(
+                catalog_id=table["CatalogId"],
+                database_name=table["DatabaseName"],
+                table_name=table["Name"],
             )
- … (21 removed lines not recovered)
-                    tags_to_add.extend(
-                        [
-                            make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
-                            for tag in tag_set
-                        ]
-                    )
-                else:
-                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
-                    # as opposed to an exception.
-                    logger.warning(
-                        f"No tags found for bucket={bucket_name} key={key_prefix}"
-                    )
-            if len(tags_to_add) == 0:
-                return None
-            if self.ctx.graph is not None:
-                logger.debug(
-                    "Connected to DatahubApi, grabbing current tags to maintain."
-                )
-                current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
-                    entity_urn=dataset_urn,
-                    aspect_type=GlobalTagsClass,
-                )
-                if current_tags:
-                    tags_to_add.extend(
-                        [current_tag.tag for current_tag in current_tags.tags]
-                    )
-            else:
-                logger.warning(
-                    "Could not connect to DatahubApi. No current tags to maintain"
-                )
-            # Remove duplicate tags
-            tags_to_add = sorted(list(set(tags_to_add)))
-            new_tags = GlobalTagsClass(
-                tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
+
+            global_tags = self._get_lake_formation_tags(tags)
+            if global_tags:
+                dataset_snapshot.aspects.append(global_tags)
+            # Generate platform resources for LF tags
+            for tag in tags:
+                yield from self.gen_platform_resource(tag)
+
+        # Create and yield the main metadata work unit
+        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        yield MetadataWorkUnit(table_name, mce=metadata_record)
+
+        # Add lineage if enabled
+        lineage_wu = self.get_lineage_if_enabled(metadata_record)
+        if lineage_wu:
+            yield lineage_wu
+
+        # Add profile if enabled
+        try:
+            yield from self.get_profile_if_enabled(
+                metadata_record, table["DatabaseName"], table["Name"]
             )
- … (5 removed lines not recovered)
-        return (
-            (self.source_config.extract_delta_schema_from_parameters is True)
-            and (provider == "delta")
-            and (num_parts > 0)
-            and (columns is not None)
-            and (len(columns) == 1)
-            and (columns[0].get("Name", "") == "col")
-            and (columns[0].get("Type", "") == "array<string>")
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
             )

- … (11 removed lines not recovered)
+    def _get_dataset_properties(self, table: Dict) -> DatasetPropertiesClass:
+        """Extract dataset properties from Glue table."""
+        storage_descriptor = table.get("StorageDescriptor", {})
+        custom_properties = {
+            **table.get("Parameters", {}),
+            **{
+                k: str(v)
+                for k, v in storage_descriptor.items()
+                if k not in ["Columns", "Parameters"]
+            },
+        }

- … (2 removed lines not recovered)
+        return DatasetPropertiesClass(
+            description=table.get("Description"),
+            customProperties=custom_properties,
+            uri=table.get("Location"),
+            tags=[],
+            name=table["Name"],
+            qualifiedName=self.get_glue_arn(
+                account_id=table["CatalogId"],
+                database=table["DatabaseName"],
+                table=table["Name"],
+            ),
+        )

- … (2 removed lines not recovered)
+    def _get_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table."""
+        if not table.get("StorageDescriptor"):
+            return None

- … (2 removed lines not recovered)
+        # Check if this is a delta table with schema in parameters
+        if self._is_delta_schema(table):
+            return self._get_delta_schema_metadata(table, table_name, dataset_urn)
+        else:
+            return self._get_glue_schema_metadata(table, table_name)

- … (11 removed lines not recovered)
+    def _is_delta_schema(self, table: Dict) -> bool:
+        """Check if table uses delta format with schema in parameters."""
+        if not self.source_config.extract_delta_schema_from_parameters:
+            return False
+
+        provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
+        num_parts = int(
+            table.get("Parameters", {}).get("spark.sql.sources.schema.numParts", "0")
+        )
+        columns = table.get("StorageDescriptor", {}).get("Columns", [])
+
+        return (
+            provider == "delta"
+            and num_parts > 0
+            and columns
+            and len(columns) == 1
+            and columns[0].get("Name", "") == "col"
+            and columns[0].get("Type", "") == "array<string>"
+        )
+
+    def _get_glue_schema_metadata(
+        self, table: Dict, table_name: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table columns."""
+        schema = table["StorageDescriptor"]["Columns"]
+        fields: List[SchemaField] = []
+
+        # Process regular columns
+        for field in schema:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=field["Name"],
+                hive_column_type=field["Type"],
+                description=field.get("Comment"),
+                default_nullable=True,
+            )
+            if schema_fields:
+                fields.extend(schema_fields)
+
+        # Process partition keys
+        partition_keys = table.get("PartitionKeys", [])
+        for partition_key in partition_keys:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=partition_key["Name"],
+                hive_column_type=partition_key.get("Type", "unknown"),
+                description=partition_key.get("Comment"),
+                default_nullable=False,
+            )
+            if schema_fields:
                 fields.extend(schema_fields)

- … (2 removed lines not recovered)
+        return SchemaMetadata(
+            schemaName=table_name,
+            version=0,
+            fields=fields,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+        )
+
+    def _get_delta_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Delta table parameters."""
+        try:
+            # Reconstruct schema from parameters
+            num_parts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
+            schema_str = "".join(
+                table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
+                for i in range(num_parts)
+            )
+            schema_json = json.loads(schema_str)
+
+            fields: List[SchemaField] = []
+            for field in schema_json["fields"]:
+                field_type = delta_type_to_hive_type(field.get("type", "unknown"))
                 schema_fields = get_schema_fields_for_hive_column(
-                    hive_column_name=
-                    hive_column_type=
-                    description=
-                    default_nullable=
+                    hive_column_name=field["name"],
+                    hive_column_type=field_type,
+                    description=field.get("description"),
+                    default_nullable=bool(field.get("nullable", True)),
                 )
- … (2 removed lines not recovered)
+                if schema_fields:
+                    fields.extend(schema_fields)

+            self.report.num_dataset_valid_delta_schema += 1
             return SchemaMetadata(
                 schemaName=table_name,
                 version=0,

@@ -1353,108 +1606,128 @@ class GlueSource(StatefulIngestionSourceBase):
                 platformSchema=MySqlDDL(tableSchema=""),
             )

- … (4 removed lines not recovered)
+        except Exception as e:
+            self.report_warning(
+                dataset_urn,
+                f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
             )
+            self.report.num_dataset_invalid_delta_schema += 1
+            return None

- … (7 removed lines not recovered)
+    def _get_data_platform_instance(self) -> DataPlatformInstanceClass:
+        """Get data platform instance aspect."""
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=(
+                make_dataplatform_instance_urn(
+                    self.platform, self.source_config.platform_instance
                 )
- … (4 removed lines not recovered)
-                schema_fields = get_schema_fields_for_hive_column(
-                    hive_column_name=field["name"],
-                    hive_column_type=field_type,
-                    description=field.get("description"),
-                    default_nullable=bool(field.get("nullable", True)),
-                )
-                assert schema_fields
-                fields.extend(schema_fields)
+                if self.source_config.platform_instance
+                else None
+            ),
+        )

- … (6 removed lines not recovered)
-            hash="",
-            platformSchema=MySqlDDL(tableSchema=""),
-        )
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _get_ownership(owner: str) -> Optional[OwnershipClass]:
+        """Get ownership aspect for a given owner."""
+        if not owner:
+            return None

- … (4 removed lines not recovered)
-            )
-            self.report.num_dataset_invalid_delta_schema += 1
-            return None
-
-        def get_data_platform_instance() -> DataPlatformInstanceClass:
-            return DataPlatformInstanceClass(
-                platform=make_data_platform_urn(self.platform),
-                instance=(
-                    make_dataplatform_instance_urn(
-                        self.platform, self.source_config.platform_instance
+        owners = [
+            OwnerClass(
+                owner=mce_builder.make_user_urn(owner),
+                type=OwnershipTypeClass.DATAOWNER,
             )
-
-                    if self.source_config.platform_instance
-                    else None
-                ),
+        ]
+        return OwnershipClass(owners=owners)

- … (6 removed lines not recovered)
-                type=OwnershipTypeClass.DATAOWNER,
-            )
-        ]
-        return OwnershipClass(
-            owners=owners,
-        )
+    def _get_s3_tags(self, table: Dict, dataset_urn: str) -> Optional[GlobalTagsClass]:
+        """Extract S3 tags if enabled."""
+        if not (
+            self.source_config.use_s3_bucket_tags
+            or self.source_config.use_s3_object_tags
+        ):
             return None

- … (4 removed lines not recovered)
-                get_dataset_properties(),
-            ],
-        )
+        # Check if table has a location (VIRTUAL_VIEW tables may not)
+        location = table.get("StorageDescriptor", {}).get("Location")
+        if not location:
+            return None

- … (2 removed lines not recovered)
-            dataset_snapshot.aspects.append(schema_metadata)
+        bucket_name = s3_util.get_bucket_name(location)
+        tags_to_add: List[str] = []

-
+        # Get bucket tags
+        if self.source_config.use_s3_bucket_tags:
+            try:
+                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
+                tags_to_add.extend(
+                    make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                    for tag in bucket_tags["TagSet"]
+                )
+            except self.s3_client.exceptions.ClientError:
+                logger.warning(f"No tags found for bucket={bucket_name}")

-        #
-        if self.
- … (4 removed lines not recovered)
+        # Get object tags
+        if self.source_config.use_s3_object_tags:
+            key_prefix = s3_util.get_key_prefix(location)
+            try:
+                object_tagging = self.s3_client.get_object_tagging(
+                    Bucket=bucket_name, Key=key_prefix
+                )
+                if object_tagging["TagSet"]:
+                    tags_to_add.extend(
+                        make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                        for tag in object_tagging["TagSet"]
+                    )
+                else:
+                    logger.warning(
+                        f"No tags found for bucket={bucket_name} key={key_prefix}"
+                    )
+            except Exception as e:
+                logger.warning(f"Failed to get object tags: {e}")

-        if
-
-            or self.source_config.use_s3_object_tags
-        ):
-            s3_tags = get_s3_tags()
-            if s3_tags is not None:
-                dataset_snapshot.aspects.append(s3_tags)
+        if not tags_to_add:
+            return None

- … (2 removed lines not recovered)
+        # Merge with existing tags if connected to DataHub API
+        if self.ctx.graph:
+            logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
+            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
+                entity_urn=dataset_urn, aspect_type=GlobalTagsClass
+            )
+            if current_tags:
+                tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
+            else:
+                logger.warning(
+                    "Could not connect to DatahubApi. No current tags to maintain"
+                )
+
+        # Remove duplicates and create tags
+        unique_tags = sorted(set(tags_to_add))
+        return GlobalTagsClass(tags=[TagAssociationClass(tag) for tag in unique_tags])
+
+    def _get_lake_formation_tags(
+        self, tags: List[LakeFormationTag]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract Lake Formation tags if enabled."""
+        tag_urns: List[str] = []
+        for tag in tags:
+            try:
+                tag_urns.append(tag.to_datahub_tag_urn().urn())
+            except InvalidUrnError as e:
+                logger.warning(
+                    f"Invalid Lake Formation tag URN for {tag}: {e}", exc_info=True
+                )
+                continue  # Skip invalid tags
+
+        tag_urns.sort()  # Sort to maintain consistent order
+        return (
+            GlobalTagsClass(tags=[TagAssociationClass(tag_urn) for tag_urn in tag_urns])
+            if tag_urns
+            else None
+        )

     def get_report(self):
         return self.report