acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
4
|
from tableauserverclient import Server, UserItem
|
|
4
5
|
|
|
@@ -10,6 +11,7 @@ class UserInfo:
|
|
|
10
11
|
user_name: str
|
|
11
12
|
site_role: str
|
|
12
13
|
site_id: str
|
|
14
|
+
email: Optional[str] = None
|
|
13
15
|
|
|
14
16
|
def has_site_administrator_explorer_privileges(self):
|
|
15
17
|
return self.site_role in [
|
|
@@ -34,4 +36,5 @@ class UserInfo:
|
|
|
34
36
|
user_name=user.name,
|
|
35
37
|
site_role=user.site_role,
|
|
36
38
|
site_id=server.site_id,
|
|
39
|
+
email=user.email,
|
|
37
40
|
)
|
|
@@ -24,7 +24,7 @@ def check_user_role(
|
|
|
24
24
|
mitigation_message_prefix: str = (
|
|
25
25
|
"Assign `Site Administrator Explorer` role to the user"
|
|
26
26
|
)
|
|
27
|
-
mitigation_message_suffix: str = "Refer to the setup guide: https://
|
|
27
|
+
mitigation_message_suffix: str = "Refer to the setup guide: https://docs.datahub.com/docs/quick-ingestion-guides/tableau/setup"
|
|
28
28
|
|
|
29
29
|
try:
|
|
30
30
|
# TODO: Add check for `Enable Derived Permissions`
|
|
@@ -2,22 +2,25 @@ import logging
|
|
|
2
2
|
import os
|
|
3
3
|
from datetime import datetime, timedelta, timezone
|
|
4
4
|
from typing import Any, Dict, List, Optional, Union
|
|
5
|
-
from urllib.parse import urlparse
|
|
6
5
|
|
|
7
6
|
import pydantic
|
|
8
7
|
from pydantic import Field
|
|
9
8
|
from typing_extensions import Literal
|
|
10
9
|
|
|
11
|
-
from datahub.configuration.common import
|
|
10
|
+
from datahub.configuration.common import (
|
|
11
|
+
AllowDenyPattern,
|
|
12
|
+
ConfigEnum,
|
|
13
|
+
ConfigModel,
|
|
14
|
+
HiddenFromDocs,
|
|
15
|
+
)
|
|
12
16
|
from datahub.configuration.source_common import (
|
|
13
17
|
DatasetSourceConfigMixin,
|
|
14
18
|
LowerCaseDatasetUrnConfigMixin,
|
|
15
19
|
)
|
|
16
20
|
from datahub.configuration.validate_field_removal import pydantic_removed_field
|
|
17
21
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
|
18
|
-
from datahub.ingestion.source.ge_data_profiler import DATABRICKS
|
|
19
22
|
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
|
|
20
|
-
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
|
|
23
|
+
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
|
|
21
24
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
22
25
|
StatefulStaleMetadataRemovalConfig,
|
|
23
26
|
)
|
|
@@ -25,6 +28,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
25
28
|
StatefulIngestionConfigBase,
|
|
26
29
|
StatefulProfilingConfigMixin,
|
|
27
30
|
)
|
|
31
|
+
from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
|
|
28
32
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
29
33
|
from datahub.ingestion.source_config.operation_config import (
|
|
30
34
|
OperationConfig,
|
|
@@ -34,6 +38,22 @@ from datahub.utilities.global_warning_util import add_global_warning
|
|
|
34
38
|
|
|
35
39
|
logger = logging.getLogger(__name__)
|
|
36
40
|
|
|
41
|
+
# Configuration default constants
|
|
42
|
+
INCLUDE_TAGS_DEFAULT = True
|
|
43
|
+
INCLUDE_HIVE_METASTORE_DEFAULT = True
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class LineageDataSource(ConfigEnum):
|
|
47
|
+
AUTO = "AUTO"
|
|
48
|
+
SYSTEM_TABLES = "SYSTEM_TABLES"
|
|
49
|
+
API = "API"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class UsageDataSource(ConfigEnum):
|
|
53
|
+
AUTO = "AUTO"
|
|
54
|
+
SYSTEM_TABLES = "SYSTEM_TABLES"
|
|
55
|
+
API = "API"
|
|
56
|
+
|
|
37
57
|
|
|
38
58
|
class UnityCatalogProfilerConfig(ConfigModel):
|
|
39
59
|
method: str = Field(
|
|
@@ -117,6 +137,7 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig
|
|
|
117
137
|
|
|
118
138
|
|
|
119
139
|
class UnityCatalogSourceConfig(
|
|
140
|
+
UnityCatalogConnectionConfig,
|
|
120
141
|
SQLCommonConfig,
|
|
121
142
|
StatefulIngestionConfigBase,
|
|
122
143
|
BaseUsageConfig,
|
|
@@ -124,23 +145,6 @@ class UnityCatalogSourceConfig(
|
|
|
124
145
|
StatefulProfilingConfigMixin,
|
|
125
146
|
LowerCaseDatasetUrnConfigMixin,
|
|
126
147
|
):
|
|
127
|
-
token: str = pydantic.Field(description="Databricks personal access token")
|
|
128
|
-
workspace_url: str = pydantic.Field(
|
|
129
|
-
description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
|
|
130
|
-
)
|
|
131
|
-
warehouse_id: Optional[str] = pydantic.Field(
|
|
132
|
-
default=None,
|
|
133
|
-
description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
|
|
134
|
-
)
|
|
135
|
-
include_hive_metastore: bool = pydantic.Field(
|
|
136
|
-
default=True,
|
|
137
|
-
description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
|
|
138
|
-
)
|
|
139
|
-
workspace_name: Optional[str] = pydantic.Field(
|
|
140
|
-
default=None,
|
|
141
|
-
description="Name of the workspace. Default to deployment name present in workspace_url",
|
|
142
|
-
)
|
|
143
|
-
|
|
144
148
|
include_metastore: bool = pydantic.Field(
|
|
145
149
|
default=False,
|
|
146
150
|
description=(
|
|
@@ -228,6 +232,15 @@ class UnityCatalogSourceConfig(
|
|
|
228
232
|
description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
|
|
229
233
|
)
|
|
230
234
|
|
|
235
|
+
include_tags: bool = pydantic.Field(
|
|
236
|
+
default=INCLUDE_TAGS_DEFAULT,
|
|
237
|
+
description=(
|
|
238
|
+
"Option to enable/disable column/table tag extraction. "
|
|
239
|
+
"Requires warehouse_id to be set since tag extraction needs to query system.information_schema.tags. "
|
|
240
|
+
"If warehouse_id is not provided, this will be automatically disabled to allow ingestion to continue."
|
|
241
|
+
),
|
|
242
|
+
)
|
|
243
|
+
|
|
231
244
|
_rename_table_ownership = pydantic_renamed_field(
|
|
232
245
|
"include_table_ownership", "include_ownership"
|
|
233
246
|
)
|
|
@@ -237,15 +250,40 @@ class UnityCatalogSourceConfig(
|
|
|
237
250
|
description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
|
|
238
251
|
)
|
|
239
252
|
|
|
253
|
+
lineage_data_source: LineageDataSource = pydantic.Field(
|
|
254
|
+
default=LineageDataSource.AUTO,
|
|
255
|
+
description=(
|
|
256
|
+
"Source for lineage data extraction. Options: "
|
|
257
|
+
f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
|
|
258
|
+
f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
|
|
259
|
+
f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
|
|
260
|
+
),
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
ignore_start_time_lineage: bool = pydantic.Field(
|
|
264
|
+
default=False,
|
|
265
|
+
description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
|
|
266
|
+
)
|
|
267
|
+
|
|
240
268
|
column_lineage_column_limit: int = pydantic.Field(
|
|
241
269
|
default=300,
|
|
242
270
|
description="Limit the number of columns to get column level lineage. ",
|
|
243
271
|
)
|
|
244
272
|
|
|
245
|
-
lineage_max_workers: int = pydantic.Field(
|
|
273
|
+
lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
|
|
246
274
|
default=5 * (os.cpu_count() or 4),
|
|
247
275
|
description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
|
|
248
|
-
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
databricks_api_page_size: int = pydantic.Field(
|
|
279
|
+
default=0,
|
|
280
|
+
ge=0,
|
|
281
|
+
description=(
|
|
282
|
+
"Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
|
|
283
|
+
"When set to 0 (default), uses server-side configured page length (recommended). "
|
|
284
|
+
"When set to a positive value, the page length is the minimum of this value and the server configured value. "
|
|
285
|
+
"Must be a non-negative integer."
|
|
286
|
+
),
|
|
249
287
|
)
|
|
250
288
|
|
|
251
289
|
include_usage_statistics: bool = Field(
|
|
@@ -253,6 +291,17 @@ class UnityCatalogSourceConfig(
|
|
|
253
291
|
description="Generate usage statistics.",
|
|
254
292
|
)
|
|
255
293
|
|
|
294
|
+
usage_data_source: UsageDataSource = pydantic.Field(
|
|
295
|
+
default=UsageDataSource.AUTO,
|
|
296
|
+
description=(
|
|
297
|
+
"Source for usage/query history data extraction. Options: "
|
|
298
|
+
f"'{UsageDataSource.AUTO.value}' (default) - Automatically use system.query.history table when SQL warehouse is configured, otherwise fall back to REST API. "
|
|
299
|
+
"This provides better performance for multi-workspace setups and large query volumes when warehouse_id is set. "
|
|
300
|
+
f"'{UsageDataSource.SYSTEM_TABLES.value}' - Force use of system.query.history table (requires SQL warehouse and SELECT permission on system.query.history). "
|
|
301
|
+
f"'{UsageDataSource.API.value}' - Force use of REST API endpoints for query history (legacy method, may have limitations with multiple workspaces)."
|
|
302
|
+
),
|
|
303
|
+
)
|
|
304
|
+
|
|
256
305
|
# TODO: Remove `type:ignore` by refactoring config
|
|
257
306
|
profiling: Union[
|
|
258
307
|
UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
|
|
@@ -272,19 +321,68 @@ class UnityCatalogSourceConfig(
|
|
|
272
321
|
description="Details about the delta lake, incase to emit siblings",
|
|
273
322
|
)
|
|
274
323
|
|
|
275
|
-
|
|
324
|
+
include_ml_model_aliases: bool = pydantic.Field(
|
|
325
|
+
default=False,
|
|
326
|
+
description="Whether to include ML model aliases in the ingestion.",
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
ml_model_max_results: int = pydantic.Field(
|
|
330
|
+
default=1000,
|
|
331
|
+
ge=0,
|
|
332
|
+
description="Maximum number of ML models to ingest.",
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
_forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
|
|
336
|
+
_forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
|
|
337
|
+
|
|
338
|
+
include_hive_metastore: bool = pydantic.Field(
|
|
339
|
+
default=INCLUDE_HIVE_METASTORE_DEFAULT,
|
|
340
|
+
description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
|
|
341
|
+
)
|
|
276
342
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
343
|
+
workspace_name: Optional[str] = pydantic.Field(
|
|
344
|
+
default=None,
|
|
345
|
+
description="Name of the workspace. Default to deployment name present in workspace_url",
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
def __init__(self, **data: Any) -> None:
    """Build the config, then switch off features that need a SQL warehouse.

    Tag extraction and Hive Metastore ingestion both run SQL queries. When
    either is enabled but no ``warehouse_id`` is configured, the feature is
    turned off, a warning is logged, and the matching ``_forced_disable_*``
    private attribute records that the change was automatic.

    Args:
        **data: Raw configuration values, forwarded unchanged to the parent
            constructor for validation.
    """
    # Let the parent run field validation and root validators first, so that
    # `warehouse_id` and the boolean flags below hold their final values.
    super().__init__(**data)

    warehouse_missing = not self.warehouse_id

    # Decide from the validated fields rather than the raw `data` mapping:
    # a raw value may still be uncoerced (for example the string "false",
    # which is truthy) and would wrongly be reported as force-disabled.
    force_disable_tags = bool(self.include_tags) and warehouse_missing
    force_disable_hive_metastore = (
        bool(self.include_hive_metastore) and warehouse_missing
    )

    if force_disable_tags:
        self.include_tags = False
        logger.warning(
            "warehouse_id is not set but include_tags=True. "
            "Automatically disabling tag extraction since it requires SQL queries. "
            "Set warehouse_id to enable tag extraction."
        )

    if force_disable_hive_metastore:
        self.include_hive_metastore = False
        logger.warning(
            "warehouse_id is not set but include_hive_metastore=True. "
            "Automatically disabling hive metastore extraction since it requires SQL queries. "
            "Set warehouse_id to enable hive metastore extraction."
        )

    # Remember which features were switched off automatically (as opposed to
    # being disabled by the user).
    self._forced_disable_tag_extraction = force_disable_tags
    self._forced_disable_hive_metastore_extraction = force_disable_hive_metastore
|
|
289
387
|
|
|
290
388
|
def is_profiling_enabled(self) -> bool:
|
|
@@ -343,11 +441,6 @@ class UnityCatalogSourceConfig(
|
|
|
343
441
|
"When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
|
|
344
442
|
)
|
|
345
443
|
|
|
346
|
-
if values.get("include_hive_metastore") and not values.get("warehouse_id"):
|
|
347
|
-
raise ValueError(
|
|
348
|
-
"When `include_hive_metastore` is set, `warehouse_id` must be set."
|
|
349
|
-
)
|
|
350
|
-
|
|
351
444
|
if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
|
|
352
445
|
profiling.warehouse_id = values["warehouse_id"]
|
|
353
446
|
|
|
@@ -356,6 +449,34 @@ class UnityCatalogSourceConfig(
|
|
|
356
449
|
|
|
357
450
|
return values
|
|
358
451
|
|
|
452
|
+
@pydantic.root_validator(skip_on_failure=True)
def validate_lineage_data_source_with_warehouse(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Reject system-table lineage when no SQL warehouse is configured.

    Args:
        values: The field values collected so far for the model.

    Returns:
        ``values`` unchanged when the combination is acceptable.

    Raises:
        ValueError: If ``lineage_data_source`` is ``SYSTEM_TABLES`` and
            ``warehouse_id`` is missing or empty.
    """
    selected_source = values.get("lineage_data_source", LineageDataSource.AUTO)

    # Only the SYSTEM_TABLES mode has a hard dependency on a warehouse.
    if selected_source != LineageDataSource.SYSTEM_TABLES:
        return values
    if values.get("warehouse_id"):
        return values

    raise ValueError(
        f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
    )
|
|
465
|
+
|
|
466
|
+
@pydantic.root_validator(skip_on_failure=True)
def validate_usage_data_source_with_warehouse(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Reject system-table usage extraction when no SQL warehouse is configured.

    Args:
        values: The field values collected so far for the model.

    Returns:
        ``values`` unchanged when the combination is acceptable.

    Raises:
        ValueError: If ``usage_data_source`` is ``SYSTEM_TABLES`` and
            ``warehouse_id`` is missing or empty.
    """
    selected_source = values.get("usage_data_source", UsageDataSource.AUTO)

    # Only the SYSTEM_TABLES mode has a hard dependency on a warehouse.
    if selected_source != UsageDataSource.SYSTEM_TABLES:
        return values
    if values.get("warehouse_id"):
        return values

    raise ValueError(
        f"usage_data_source='{UsageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
    )
|
|
479
|
+
|
|
359
480
|
@pydantic.validator("schema_pattern", always=True)
|
|
360
481
|
def schema_pattern_should__always_deny_information_schema(
|
|
361
482
|
cls, v: AllowDenyPattern
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Databricks Unity Catalog connection configuration."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
import pydantic
|
|
7
|
+
from pydantic import Field
|
|
8
|
+
|
|
9
|
+
from datahub.configuration.common import ConfigModel
|
|
10
|
+
from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
|
|
11
|
+
|
|
12
|
+
DATABRICKS = "databricks"


class UnityCatalogConnectionConfig(ConfigModel):
    """
    Connection settings for Databricks Unity Catalog.

    Holds only connection-related fields so that they can be reused across
    different sources.
    """

    # SQLAlchemy URL scheme used by get_sql_alchemy_url().
    scheme: str = DATABRICKS
    token: str = pydantic.Field(description="Databricks personal access token")
    workspace_url: str = pydantic.Field(
        description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
    )
    # NOTE(review): the description below says every listed feature is
    # auto-disabled when warehouse_id is missing, but UnityCatalogSourceConfig
    # rejects lineage_data_source=SYSTEM_TABLES with a ValueError in that case.
    # Confirm the intended behaviour and reword the description accordingly.
    warehouse_id: Optional[str] = pydantic.Field(
        default=None,
        description=(
            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
            "Required for the following features that need SQL access: "
            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
        ),
    )

    extra_client_options: Dict[str, Any] = Field(
        default={},
        description="Additional options to pass to Databricks SQLAlchemy client.",
    )

    def __init__(self, **data: Any):
        # Plain pass-through to the parent constructor.
        super().__init__(**data)

    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
        """Build the SQLAlchemy URL for the configured SQL warehouse.

        Args:
            database: Optional catalog name. When given, it is used as the
                URL database and also passed as the ``catalog`` URI option.

        Returns:
            The URL produced by ``make_sqlalchemy_uri`` for this workspace,
            authenticating as user ``token`` with the access token as password.

        Raises:
            ValueError: If ``warehouse_id`` is not set. Without it the HTTP
                path would be ``/sql/1.0/warehouses/None``, which cannot
                refer to a real warehouse.
        """
        if not self.warehouse_id:
            raise ValueError(
                "warehouse_id must be set to build a Databricks SQLAlchemy URL"
            )

        uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
        if database:
            uri_opts["catalog"] = database

        # Only the host part of the workspace URL belongs in the SQLAlchemy URL.
        workspace_host = urlparse(self.workspace_url).netloc
        return make_sqlalchemy_uri(
            scheme=self.scheme,
            username="token",
            password=self.token,
            at=workspace_host,
            db=database,
            uri_opts=uri_opts,
        )

    def get_options(self) -> dict:
        """Return the extra options configured for the Databricks SQLAlchemy client."""
        return self.extra_client_options
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from datahub.api.entities.external.external_entities import (
|
|
4
|
+
PlatformResourceRepository,
|
|
5
|
+
)
|
|
6
|
+
from datahub.ingestion.source.unity.tag_entities import (
|
|
7
|
+
UnityCatalogTagPlatformResource,
|
|
8
|
+
UnityCatalogTagPlatformResourceId,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UnityCatalogPlatformResourceRepository(
    PlatformResourceRepository[
        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
    ]
):
    """Platform resource repository specialised for Unity Catalog tags.

    Binds the generic ``PlatformResourceRepository`` to the Unity Catalog tag
    id and resource types. The class body defines nothing of its own, so all
    behaviour is inherited from the base repository.
    """
|