acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/platform_resource_repository.py

@@ -0,0 +1,30 @@
+import logging
+from typing import Optional
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResource,
+    LakeFormationTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GluePlatformResourceRepository(
+    PlatformResourceRepository[
+        LakeFormationTagPlatformResourceId, LakeFormationTagPlatformResource
+    ]
+):
+    """AWS Glue-specific platform resource repository with tag-related operations."""
+
+    def __init__(
+        self,
+        graph: DataHubGraph,
+        platform_instance: Optional[str] = None,
+        catalog: Optional[str] = None,
+    ):
+        super().__init__(graph, platform_instance)
+        self.catalog = catalog
datahub/ingestion/source/aws/s3_boto_utils.py

@@ -1,5 +1,6 @@
 import logging
-from
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterable, Optional, Union

 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

+LIST_OBJECTS_PAGE_SIZE = 1000
+

 def get_s3_tags(
     bucket_name: str,
@@ -74,16 +80,82 @@ def get_s3_tags(
     return new_tags


+@dataclass
+class DirEntry:
+    """
+    Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
+    other attributes of a directory entry. Currently only used to represent S3 folder-like
+    paths.
+    """
+
+    name: str
+    path: str
+
+
 def list_folders_path(
-    s3_uri: str,
-
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable[DirEntry]:
+    """
+    Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
+    optionally filtering by startswith. Returned entries never contain a trailing slash.
+    """
+
+    if not is_s3_uri(s3_uri):
+        raise ValueError("Not a s3 URI: " + s3_uri)
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
+    bucket_name = get_bucket_name(s3_uri)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for folder in list_buckets(startswith, aws_config):
+            yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    for folder in list_folders(bucket_name, prefix, aws_config):
+        folder = folder.removesuffix("/").split("/")[-1]
+        yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+
+
+def list_objects_recursive_path(
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable["ObjectSummary"]:
+    """
+    Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
+    filtering by startswith.
+    """
+
     if not is_s3_uri(s3_uri):
         raise ValueError("Not a s3 URI: " + s3_uri)
     if aws_config is None:
         raise ValueError("aws_config not set. Cannot browse s3")
+    if startswith and "/" in startswith:
+        raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
     bucket_name = get_bucket_name(s3_uri)
-
-
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for bucket_name in list_buckets(startswith, aws_config):
+            yield from list_objects_recursive(bucket_name, "", aws_config)
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    yield from list_objects_recursive(bucket_name, prefix, aws_config)


 def list_folders(
@@ -99,3 +171,26 @@ def list_folders(
         if folder.endswith("/"):
             folder = folder[:-1]
         yield f"{folder}"
+
+
+def list_buckets(
+    prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable[str]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_client = aws_config.get_s3_client()
+    paginator = s3_client.get_paginator("list_buckets")
+    for page in paginator.paginate(Prefix=prefix):
+        for o in page.get("Buckets", []):
+            yield str(o.get("Name"))
+
+
+def list_objects_recursive(
+    bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_resource = aws_config.get_s3_resource()
+    bucket = s3_resource.Bucket(bucket_name)
+    for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
+        yield obj
datahub/ingestion/source/aws/tag_entities.py

@@ -0,0 +1,270 @@
+import logging
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+
+
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
+    """
+    A LakeFormationTagPlatformResourceId uniquely identifies a Lake Formation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def get_or_create_from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value) if tag.value is not None else None,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
+            exists_in_lake_formation=exists_in_lake_formation,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
+                )
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type LakeFormationTagPlatformResource
+        # If we do, we can use it to create the LakeFormationTagPlatformResourceId
+        # Else, we need to generate a new LakeFormationTagPlatformResourceId
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new LakeFormationTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=new_tag_id.tag_key,
+                    tag_value=new_tag_id.tag_value,
+                    platform_instance=new_tag_id.platform_instance,
+                    catalog=new_tag_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=new_tag_id.persisted,
+                )
+            return new_tag_id
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]] = None
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def create_default(
+        cls,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
+    ) -> "LakeFormationTagPlatformResource":
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
+        return cls(
+            id=default_entity_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
datahub/ingestion/source/azure/azure_common.py

@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )

     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )

     def get_credentials(
@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
         )
         return self.sas_token if self.sas_token is not None else self.account_key

-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def _check_credential_values(cls, values: Dict) -> Dict:
         if (
             values.get("account_key")
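Background on the @root_validator(skip_on_failure=True) change (not itself part of the diff): in pydantic v1 a post root validator without this flag still runs when field validation has already failed, so it can see a partial `values` dict, and pydantic v2's v1-compatibility layer rejects a bare @root_validator() outright; presumably that is what motivated the fix. A minimal standalone sketch with a hypothetical model:

    from typing import Dict

    from pydantic import BaseModel, root_validator


    class AzureCreds(BaseModel):  # hypothetical stand-in for AzureConnectionConfig
        account_name: str
        account_key: str = ""
        sas_token: str = ""

        # skip_on_failure=True: if a field above already failed validation, this
        # validator is skipped rather than run against a partial `values` dict.
        @root_validator(skip_on_failure=True)
        def _check_credential_values(cls, values: Dict) -> Dict:
            if not (values.get("account_key") or values.get("sas_token")):
                raise ValueError("Provide either account_key or sas_token")
            return values


    creds = AzureCreds(account_name="mystorageacct", sas_token="sv=2024-token")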
datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional

+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,9 +45,11 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
+    RedundantQueriesRunSkipHandler,
     RedundantUsageRunSkipHandler,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -77,7 +80,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -99,6 +109,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -135,7 +146,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
             None
         )
-        if
+        if (
+            self.config.enable_stateful_lineage_ingestion
+            and not self.config.use_queries_v2
+        ):
             redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
                 source=self,
                 config=self.config,
@@ -241,7 +255,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]

+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
@@ -270,28 +300,41 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         ):
             return

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        redundant_queries_run_skip_handler: Optional[
+            RedundantQueriesRunSkipHandler
+        ] = None
+        if self.config.enable_stateful_time_window:
+            redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
+                source=self,
+                config=self.config,
+                pipeline_name=self.ctx.pipeline_name,
+                run_id=self.ctx.run_id,
+            )
+
+        with (
+            self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+            BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                redundant_run_skip_handler=redundant_queries_run_skip_handler,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor,
+        ):
             self.report.queries_extractor = queries_extractor.report
             yield from queries_extractor.get_workunits_internal()
         else: