acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/platform_resource_repository.py
@@ -0,0 +1,30 @@
+import logging
+from typing import Optional
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResource,
+    LakeFormationTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GluePlatformResourceRepository(
+    PlatformResourceRepository[
+        LakeFormationTagPlatformResourceId, LakeFormationTagPlatformResource
+    ]
+):
+    """AWS Glue-specific platform resource repository with tag-related operations."""
+
+    def __init__(
+        self,
+        graph: DataHubGraph,
+        platform_instance: Optional[str] = None,
+        catalog: Optional[str] = None,
+    ):
+        super().__init__(graph, platform_instance)
+        self.catalog = catalog
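For orientation, a minimal usage sketch of the new repository class above, assuming a reachable DataHub GMS endpoint; the server URL and the "my_instance" / "my_catalog" values are placeholders, not part of this diff:

from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.ingestion.source.aws.platform_resource_repository import (
    GluePlatformResourceRepository,
)

# Placeholder connection details; point this at a real DataHub GMS instance.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# platform_instance and catalog are optional and only scope the generated
# platform-resource keys (see the tag_entities.py hunk further down).
repo = GluePlatformResourceRepository(
    graph=graph,
    platform_instance="my_instance",
    catalog="my_catalog",
)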
datahub/ingestion/source/aws/s3_boto_utils.py
@@ -1,5 +1,6 @@
 import logging
-from
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterable, Optional, Union
 
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
+LIST_OBJECTS_PAGE_SIZE = 1000
+
 
 def get_s3_tags(
     bucket_name: str,
@@ -74,16 +80,82 @@ def get_s3_tags(
     return new_tags
 
 
+@dataclass
+class DirEntry:
+    """
+    Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
+    other attributes of a directory entry. Currently only used to represent S3 folder-like
+    paths.
+    """
+
+    name: str
+    path: str
+
+
 def list_folders_path(
-    s3_uri: str,
-
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable[DirEntry]:
+    """
+    Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
+    optionally filtering by startswith. Returned entries never contain a trailing slash.
+    """
+
+    if not is_s3_uri(s3_uri):
+        raise ValueError("Not a s3 URI: " + s3_uri)
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
+    bucket_name = get_bucket_name(s3_uri)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for folder in list_buckets(startswith, aws_config):
+            yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    for folder in list_folders(bucket_name, prefix, aws_config):
+        folder = folder.removesuffix("/").split("/")[-1]
+        yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+
+
+def list_objects_recursive_path(
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable["ObjectSummary"]:
+    """
+    Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
+    filtering by startswith.
+    """
+
     if not is_s3_uri(s3_uri):
         raise ValueError("Not a s3 URI: " + s3_uri)
     if aws_config is None:
         raise ValueError("aws_config not set. Cannot browse s3")
+    if startswith and "/" in startswith:
+        raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
     bucket_name = get_bucket_name(s3_uri)
-
-
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for bucket_name in list_buckets(startswith, aws_config):
+            yield from list_objects_recursive(bucket_name, "", aws_config)
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    yield from list_objects_recursive(bucket_name, prefix, aws_config)
 
 
 def list_folders(
@@ -99,3 +171,26 @@ def list_folders(
             if folder.endswith("/"):
                 folder = folder[:-1]
             yield f"{folder}"
+
+
+def list_buckets(
+    prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable[str]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_client = aws_config.get_s3_client()
+    paginator = s3_client.get_paginator("list_buckets")
+    for page in paginator.paginate(Prefix=prefix):
+        for o in page.get("Buckets", []):
+            yield str(o.get("Name"))
+
+
+def list_objects_recursive(
+    bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_resource = aws_config.get_s3_resource()
+    bucket = s3_resource.Bucket(bucket_name)
+    for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
+        yield obj
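A short sketch of how the new helpers above might be called, assuming default AWS credential resolution; the bucket, prefix, and region values are placeholders:

from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_boto_utils import (
    list_folders_path,
    list_objects_recursive_path,
)

aws_config = AwsConnectionConfig(aws_region="us-east-1")  # placeholder region

# Yields DirEntry objects such as DirEntry(name="raw", path="s3://my-bucket/data/raw"),
# one per immediate sub-folder, never with a trailing slash.
for entry in list_folders_path("s3://my-bucket/data/", aws_config=aws_config):
    print(entry.name, entry.path)

# Yields boto3 ObjectSummary objects for every key under the prefix, recursively,
# restricted to keys whose next path component starts with "events".
for obj in list_objects_recursive_path(
    "s3://my-bucket/data/", startswith="events", aws_config=aws_config
):
    print(obj.key, obj.size)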
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )
 
datahub/ingestion/source/aws/sagemaker_processors/models.py
@@ -323,7 +323,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -331,7 +331,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
@@ -368,7 +368,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -376,7 +376,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
datahub/ingestion/source/aws/tag_entities.py
@@ -0,0 +1,270 @@
+import logging
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+
+
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
+    """
+    A LakeFormationTag is a unique identifier for a Lakeformation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def get_or_create_from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value) if tag.value is not None else None,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
+            exists_in_lake_formation=exists_in_lake_formation,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
+                )
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: "GluePlatformResourceRepository",
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type UnityCatalogTagPlatformResource
+        # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
+        # Else, we need to generate a new UnityCatalogTagPlatformResourceId
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=new_tag_id.tag_key,
+                    tag_value=new_tag_id.tag_value,
+                    platform_instance=new_tag_id.platform_instance,
+                    catalog=new_tag_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=new_tag_id.persisted,
+                )
+            return new_tag_id
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]] = None
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def create_default(
+        cls,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
+    ) -> "LakeFormationTagPlatformResource":
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
+        return cls(
+            id=default_entity_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
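To illustrate the key scheme encoded in to_platform_resource_key above, a small sketch with hypothetical tag values (nothing here is taken from a real catalog):

from datahub.ingestion.source.aws.tag_entities import LakeFormationTagPlatformResourceId

tag_id = LakeFormationTagPlatformResourceId(
    tag_key="env",
    tag_value="prod",
    platform_instance="my_instance",
    catalog="123456789012",  # hypothetical Glue catalog id
)

key = tag_id.to_platform_resource_key()
# key.platform == "glue"
# key.resource_type == "LakeFormationTagPlatformResource"
# key.primary_key == "123456789012.env:prod" (falls back to "env:prod" when catalog is None)
# key.platform_instance == "my_instance"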
datahub/ingestion/source/azure/azure_common.py
@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_credentials(
@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
         )
         return self.sas_token if self.sas_token is not None else self.account_key
 
-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def _check_credential_values(cls, values: Dict) -> Dict:
         if (
             values.get("account_key")
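For context on the @root_validator(skip_on_failure=True) change above: with pydantic v1-style root validators, a post-validation (pre=False) validator normally still runs when individual field validators have already failed, so `values` can be missing keys; skip_on_failure=True suppresses it in that case, and pydantic v2's compatibility shim requires the flag for such validators. A minimal sketch, using an illustrative model that is not from the DataHub codebase:

from pydantic import BaseModel, ValidationError, root_validator  # pydantic v1-style API

class ExampleConfig(BaseModel):
    account_name: str
    account_key: str

    @root_validator(skip_on_failure=True)
    def _check(cls, values):
        # Only reached when all field validators passed, so `values` is complete.
        if not values.get("account_key"):
            raise ValueError("account_key must be set")
        return values

try:
    ExampleConfig()  # both required fields missing
except ValidationError as e:
    print(e)  # reports only the two missing-field errors; _check never runs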
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,9 +45,11 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
+    RedundantQueriesRunSkipHandler,
     RedundantUsageRunSkipHandler,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -77,7 +80,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -99,6 +109,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -135,7 +146,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
             None
         )
-        if
+        if (
+            self.config.enable_stateful_lineage_ingestion
+            and not self.config.use_queries_v2
+        ):
             redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
                 source=self,
                 config=self.config,
@@ -241,7 +255,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
@@ -270,8 +300,20 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-
-
+            redundant_queries_run_skip_handler: Optional[
+                RedundantQueriesRunSkipHandler
+            ] = None
+            if self.config.enable_stateful_time_window:
+                redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
+                    source=self,
+                    config=self.config,
+                    pipeline_name=self.ctx.pipeline_name,
+                    run_id=self.ctx.run_id,
+                )
+
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
                     connection=self.config.get_bigquery_client(),
                     schema_api=self.bq_schema_extractor.schema_api,
                     config=BigQueryQueriesExtractorConfig(
@@ -288,11 +330,13 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                     structured_report=self.report,
                     filters=self.filters,
                     identifiers=self.identifiers,
+                    redundant_run_skip_handler=redundant_queries_run_skip_handler,
                     schema_resolver=self.sql_parser_schema_resolver,
                     discovered_tables=self.bq_schema_extractor.table_refs,
-                ) as queries_extractor
-
-
+                ) as queries_extractor,
+            ):
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(