acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/proxy.py
@@ -4,16 +4,22 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
 
 import dataclasses
 import logging
+import os
+from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
-from typing import Any, Dict, Iterable, List, Optional, Union, cast
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
 from unittest.mock import patch
 
+import cachetools
+from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
     CatalogInfo,
     ColumnInfo,
     GetMetastoreSummaryResponse,
     MetastoreInfo,
+    ModelVersionInfo,
+    RegisteredModelInfo,
     SchemaInfo,
     TableInfo,
 )
@@ -25,9 +31,17 @@ from databricks.sdk.service.sql import (
     QueryStatus,
 )
 from databricks.sdk.service.workspace import ObjectType
+from databricks.sql import connect
+from databricks.sql.types import Row
+from typing_extensions import assert_never
 
 from datahub._version import nice_version_name
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
+from datahub.ingestion.source.unity.config import (
+    LineageDataSource,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -39,7 +53,10 @@ from datahub.ingestion.source.unity.proxy_types import (
     CustomCatalogType,
     ExternalTableReference,
     Metastore,
+    Model,
+    ModelVersion,
     Notebook,
+    NotebookReference,
     Query,
     Schema,
     ServicePrincipal,
@@ -47,9 +64,31 @@ from datahub.ingestion.source.unity.proxy_types import (
     TableReference,
 )
 from datahub.ingestion.source.unity.report import UnityCatalogReport
+from datahub.utilities.file_backed_collections import FileBackedDict
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+# It is enough to keep the cache size to 1, since we only process one catalog at a time
+# We need to change this if we want to support parallel processing of multiple catalogs
+_MAX_CONCURRENT_CATALOGS = 1
+
+
+# Import and apply the proxy patch from separate module
+try:
+    from datahub.ingestion.source.unity.proxy_patch import (
+        apply_databricks_proxy_fix,
+        mask_proxy_credentials,
+    )
+
+    # Apply the fix when the module is imported
+    apply_databricks_proxy_fix()
+except ImportError as e:
+    logger.debug(f"Could not import proxy patch module: {e}")
+
+    # Fallback function for masking credentials
+    def mask_proxy_credentials(url: Optional[str]) -> str:
+        return "***MASKED***" if url else "None"
+
 
 @dataclasses.dataclass
 class TableInfoWithGeneration(TableInfo):
@@ -85,6 +124,32 @@ class QueryFilterWithStatementTypes(QueryFilter):
         return v
 
 
+@dataclasses.dataclass
+class TableUpstream:
+    table_name: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class ExternalUpstream:
+    path: str
+    source_type: str
+    last_updated: Optional[datetime] = None
+
+
+@dataclasses.dataclass
+class TableLineageInfo:
+    upstreams: List[TableUpstream] = dataclasses.field(default_factory=list)
+    external_upstreams: List[ExternalUpstream] = dataclasses.field(default_factory=list)
+    upstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+    downstream_notebooks: List[NotebookReference] = dataclasses.field(
+        default_factory=list
+    )
+
+
 class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     _workspace_client: WorkspaceClient
     _workspace_url: str
@@ -98,6 +163,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         warehouse_id: Optional[str],
         report: UnityCatalogReport,
         hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
+        lineage_data_source: LineageDataSource = LineageDataSource.AUTO,
+        usage_data_source: UsageDataSource = UsageDataSource.AUTO,
+        databricks_api_page_size: int = 0,
     ):
         self._workspace_client = WorkspaceClient(
             host=workspace_url,
@@ -108,9 +176,24 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self.lineage_data_source = lineage_data_source
+        self.usage_data_source = usage_data_source
+        self.databricks_api_page_size = databricks_api_page_size
+        self._sql_connection_params = {
+            "server_hostname": self._workspace_client.config.host.replace(
+                "https://", ""
+            ),
+            "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
+            "access_token": self._workspace_client.config.token,
+            "user_agent_entry": "datahub",
+        }
 
     def check_basic_connectivity(self) -> bool:
-        return bool(
+        return bool(
+            self._workspace_client.catalogs.list(
+                include_browse=True, max_results=self.databricks_api_page_size
+            )
+        )
 
     def assigned_metastore(self) -> Optional[Metastore]:
         response = self._workspace_client.metastores.summary()
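The constructor above now precomputes `_sql_connection_params` (SQL warehouse hostname, HTTP path, and access token). The `_execute_sql_query` helper that consumes them is referenced later in this diff but its body is not shown, so the following is only a minimal sketch of how such parameters are typically used with the databricks-sql-connector's `connect()`; the function name and the parameter binding are illustrative assumptions, not the actual implementation.

```python
# Illustrative sketch only; the real _execute_sql_query helper is not part of this diff.
from typing import Any, Dict, List, Optional, Sequence

from databricks.sql import connect


def run_warehouse_query(
    connection_params: Dict[str, Any],
    query: str,
    parameters: Optional[Sequence[Any]] = None,
) -> List[Any]:
    """Run a query against a Databricks SQL warehouse and return all rows."""
    # connect() accepts the core keys assembled in _sql_connection_params above
    # (server_hostname, http_path, access_token) as keyword arguments.
    with connect(**connection_params) as connection:
        with connection.cursor() as cursor:
            # How the %s placeholders used in the queries in this diff are bound
            # depends on the connector version; PEP 249-style parameters assumed here.
            cursor.execute(query, parameters)
            return cursor.fetchall()
```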
@@ -120,7 +203,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         if self.hive_metastore_proxy:
             yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
 
-        response = self._workspace_client.catalogs.list(
+        response = self._workspace_client.catalogs.list(
+            include_browse=True, max_results=self.databricks_api_page_size
+        )
         if not response:
             logger.info("Catalogs not found")
             return
@@ -152,7 +237,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
             return
         response = self._workspace_client.schemas.list(
-            catalog_name=catalog.name,
+            catalog_name=catalog.name,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -174,6 +261,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             catalog_name=schema.catalog.name,
             schema_name=schema.name,
             include_browse=True,
+            max_results=self.databricks_api_page_size,
         )
         if not response:
             logger.info(f"Tables not found for schema {schema.id}")
@@ -189,6 +277,40 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 logger.warning(f"Error parsing table: {e}")
                 self.report.report_warning("table-parse", str(e))
 
+    def ml_models(
+        self, schema: Schema, max_results: Optional[int] = None
+    ) -> Iterable[Model]:
+        response = self._workspace_client.registered_models.list(
+            catalog_name=schema.catalog.name,
+            schema_name=schema.name,
+            max_results=max_results,
+        )
+        for ml_model in response:
+            optional_ml_model = self._create_ml_model(schema, ml_model)
+            if optional_ml_model:
+                yield optional_ml_model
+
+    def ml_model_versions(
+        self, ml_model: Model, include_aliases: bool = False
+    ) -> Iterable[ModelVersion]:
+        response = self._workspace_client.model_versions.list(
+            full_name=ml_model.id,
+            include_browse=True,
+            max_results=self.databricks_api_page_size,
+        )
+        for version in response:
+            if version.version is not None:
+                if include_aliases:
+                    # to get aliases info, use GET
+                    version = self._workspace_client.model_versions.get(
+                        ml_model.id, version.version, include_aliases=True
+                    )
+                optional_ml_model_version = self._create_ml_model_version(
+                    ml_model, version
+                )
+                if optional_ml_model_version:
+                    yield optional_ml_model_version
+
     def service_principals(self) -> Iterable[ServicePrincipal]:
         for principal in self._workspace_client.service_principals.list():
             optional_sp = self._create_service_principal(principal)
@@ -206,7 +328,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         return group_list
 
     def workspace_notebooks(self) -> Iterable[Notebook]:
-
+        workspace_objects_iter = self._workspace_client.workspace.list(
+            "/", recursive=True, max_results=self.databricks_api_page_size
+        )
+        for obj in workspace_objects_iter:
             if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path:
                 yield Notebook(
                     id=obj.object_id,
@@ -248,7 +373,6 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _query_history(
         self,
         filter_by: QueryFilterWithStatementTypes,
-        max_results: int = 1000,
         include_metrics: bool = False,
     ) -> Iterable[QueryInfo]:
         """Manual implementation of the query_history.list() endpoint.
@@ -260,9 +384,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         """
         method = "GET"
         path = "/api/2.0/sql/history/queries"
+
         body: Dict[str, Any] = {
             "include_metrics": include_metrics,
-            "max_results":
+            "max_results": self.databricks_api_page_size,  # Max batch size
         }
 
         response: dict = self._workspace_client.api_client.do(  # type: ignore
@@ -280,10 +405,257 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 method, path, body={**body, "page_token": response["next_page_token"]}
             )
 
-    def
+    def get_query_history_via_system_tables(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+    ) -> Iterable[Query]:
+        """Get query history using system.query.history table.
+
+        This method provides an alternative to the REST API for fetching query history,
+        offering better performance and richer data for large query volumes.
+        """
+        logger.info(
+            f"Fetching query history from system.query.history for period: {start_time} to {end_time}"
+        )
+
+        allowed_types = [typ.value for typ in ALLOWED_STATEMENT_TYPES]
+        statement_type_filter = ", ".join(f"'{typ}'" for typ in allowed_types)
+
+        query = f"""
+            SELECT
+                statement_id,
+                statement_text,
+                statement_type,
+                start_time,
+                end_time,
+                executed_by,
+                executed_as,
+                executed_by_user_id,
+                executed_as_user_id
+            FROM system.query.history
+            WHERE
+                start_time >= %s
+                AND end_time <= %s
+                AND execution_status = 'FINISHED'
+                AND statement_type IN ({statement_type_filter})
+            ORDER BY start_time
+        """
+
+        try:
+            rows = self._execute_sql_query(query, (start_time, end_time))
+            for row in rows:
+                try:
+                    yield Query(
+                        query_id=row.statement_id,
+                        query_text=row.statement_text,
+                        statement_type=(
+                            QueryStatementType(row.statement_type)
+                            if row.statement_type
+                            else None
+                        ),
+                        start_time=row.start_time,
+                        end_time=row.end_time,
+                        user_id=row.executed_by_user_id,
+                        user_name=row.executed_by,
+                        executed_as_user_id=row.executed_as_user_id,
+                        executed_as_user_name=row.executed_as,
+                    )
+                except Exception as e:
+                    logger.warning(f"Error parsing query from system table: {e}")
+                    self.report.report_warning("query-parse-system-table", str(e))
+        except Exception as e:
+            logger.error(
+                f"Error fetching query history from system tables: {e}", exc_info=True
+            )
+            self.report.report_failure(
+                title="Failed to fetch query history from system tables",
+                message="Error querying system.query.history table",
+                context=f"Query period: {start_time} to {end_time}",
+            )
+
+    def _build_datetime_where_conditions(
+        self, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None
+    ) -> str:
+        """Build datetime filtering conditions for lineage queries."""
+        conditions = []
+        if start_time:
+            conditions.append(f"event_time >= '{start_time.isoformat()}'")
+        if end_time:
+            conditions.append(f"event_time <= '{end_time.isoformat()}'")
+        return " AND " + " AND ".join(conditions) if conditions else ""
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_table_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[TableLineageInfo]:
+        """Get table lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching table lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    entity_type, entity_id,
+                    source_table_full_name, source_type, source_path,
+                    target_table_full_name, target_type,
+                    max(event_time) as last_updated
+                FROM system.access.table_lineage
+                WHERE
+                    (target_table_catalog = %s or source_table_catalog = %s)
+                    {additional_where}
+                GROUP BY
+                    entity_type, entity_id,
+                    source_table_full_name, source_type, source_path,
+                    target_table_full_name, target_type
+            """
+            rows = self._execute_sql_query(query, [catalog, catalog])
+
+            result_dict: FileBackedDict[TableLineageInfo] = FileBackedDict()
+            for row in rows:
+                entity_type = row["entity_type"]
+                entity_id = row["entity_id"]
+                source_full_name = row["source_table_full_name"]
+                target_full_name = row["target_table_full_name"]
+                source_type = row["source_type"]
+                source_path = row["source_path"]
+                last_updated = row["last_updated"]
+
+                # Initialize TableLineageInfo for both source and target tables if they're in our catalog
+                for table_name in [source_full_name, target_full_name]:
+                    if (
+                        table_name
+                        and table_name.startswith(f"{catalog}.")
+                        and table_name not in result_dict
+                    ):
+                        result_dict[table_name] = TableLineageInfo()
+
+                # Process upstream relationships (target table gets upstreams)
+                if target_full_name and target_full_name.startswith(f"{catalog}."):
+                    # Handle table upstreams
+                    if (
+                        source_type in ["TABLE", "VIEW"]
+                        and source_full_name != target_full_name
+                    ):
+                        upstream = TableUpstream(
+                            table_name=source_full_name,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstreams.append(upstream)
+
+                    # Handle external upstreams (PATH type)
+                    elif source_type == "PATH":
+                        external_upstream = ExternalUpstream(
+                            path=source_path,
+                            source_type=source_type,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].external_upstreams.append(
+                            external_upstream
+                        )
+
+                    # Handle upstream notebooks (notebook -> table)
+                    elif entity_type == "NOTEBOOK":
+                        notebook_ref = NotebookReference(
+                            id=entity_id,
+                            last_updated=last_updated,
+                        )
+                        result_dict[target_full_name].upstream_notebooks.append(
+                            notebook_ref
+                        )
+
+                # Process downstream relationships (source table gets downstream notebooks)
+                if (
+                    entity_type == "NOTEBOOK"
+                    and source_full_name
+                    and source_full_name.startswith(f"{catalog}.")
+                ):
+                    notebook_ref = NotebookReference(
+                        id=entity_id,
+                        last_updated=last_updated,
+                    )
+                    result_dict[source_full_name].downstream_notebooks.append(
+                        notebook_ref
+                    )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting table lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_column_lineage_via_system_tables(
+        self,
+        catalog: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> FileBackedDict[Dict[str, dict]]:
+        """Get column lineage for all tables in a catalog using system tables."""
+        logger.info(f"Fetching column lineage for catalog: {catalog}")
+        try:
+            additional_where = self._build_datetime_where_conditions(
+                start_time, end_time
+            )
+
+            query = f"""
+                SELECT
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
+                    target_table_schema, target_table_name, target_column_name,
+                    max(event_time) as last_updated
+                FROM system.access.column_lineage
+                WHERE
+                    target_table_catalog = %s
+                    AND target_table_schema IS NOT NULL
+                    AND target_table_name IS NOT NULL
+                    AND target_column_name IS NOT NULL
+                    AND source_table_catalog IS NOT NULL
+                    AND source_table_schema IS NOT NULL
+                    AND source_table_name IS NOT NULL
+                    AND source_column_name IS NOT NULL
+                    {additional_where}
+                GROUP BY
+                    source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
+                    target_table_schema, target_table_name, target_column_name
+            """
+            rows = self._execute_sql_query(query, [catalog])
+
+            result_dict: FileBackedDict[Dict[str, dict]] = FileBackedDict()
+            for row in rows:
+                result_dict.setdefault(row["target_table_schema"], {}).setdefault(
+                    row["target_table_name"], {}
+                ).setdefault(row["target_column_name"], []).append(
+                    # make fields look like the response from the older HTTP API
+                    {
+                        "catalog_name": row["source_table_catalog"],
+                        "schema_name": row["source_table_schema"],
+                        "table_name": row["source_table_name"],
+                        "name": row["source_column_name"],
+                        "last_updated": row["last_updated"],
+                    }
+                )
+
+            return result_dict
+        except Exception as e:
+            logger.warning(
+                f"Error getting column lineage for catalog {catalog}: {e}",
+                exc_info=True,
+            )
+            return FileBackedDict()
+
+    def list_lineages_by_table_via_http_api(
         self, table_name: str, include_entity_lineage: bool
     ) -> dict:
         """List table lineage by table name."""
+        logger.debug(f"Getting table lineage for {table_name}")
         return self._workspace_client.api_client.do(  # type: ignore
             method="GET",
             path="/api/2.0/lineage-tracking/table-lineage",
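Both catalog-level lineage fetchers above are wrapped in `@cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))`, so the bulk system-table query runs once per catalog (and time window) and subsequent per-table lookups are served from the cache. A standalone sketch of that caching behaviour, using hypothetical names rather than DataHub's code:

```python
import cachetools
from cachetools import cached


# maxsize=1 mirrors _MAX_CONCURRENT_CATALOGS above: only the most recently fetched
# catalog is kept, which is enough when catalogs are processed one at a time.
@cached(cachetools.FIFOCache(maxsize=1))
def fetch_catalog_lineage(catalog: str) -> dict:
    print(f"running expensive bulk query for {catalog}")
    return {f"{catalog}.schema.table": ["upstream_a", "upstream_b"]}


fetch_catalog_lineage("main")   # executes the bulk query
fetch_catalog_lineage("main")   # cache hit, no query
fetch_catalog_lineage("other")  # new key evicts "main" from the FIFO cache
```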
@@ -293,67 +665,226 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             },
         )
 
-    def
+    def list_lineages_by_column_via_http_api(
+        self, table_name: str, column_name: str
+    ) -> list:
         """List column lineage by table name and column name."""
-
-
-
-
-
+        logger.debug(f"Getting column lineage for {table_name}.{column_name}")
+        try:
+            return (
+                self._workspace_client.api_client.do(  # type: ignore
+                    "GET",
+                    "/api/2.0/lineage-tracking/column-lineage",
+                    body={"table_name": table_name, "column_name": column_name},
+                ).get("upstream_cols")
+                or []
+            )
+        except Exception as e:
+            logger.warning(
+                f"Error getting column lineage on table {table_name}, column {column_name}: {e}",
+                exc_info=True,
+            )
+            return []
 
-    def table_lineage(
+    def table_lineage(
+        self,
+        table: Table,
+        include_entity_lineage: bool,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
         if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
             # Lineage is not available for Hive Metastore Tables.
             return None
-        # Lineage endpoint doesn't exists on 2.1 version
-        try:
-            response: dict = self.list_lineages_by_table(
-                table_name=table.ref.qualified_table_name,
-                include_entity_lineage=include_entity_lineage,
-            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            for notebook in item.get("notebookInfos") or []:
-                table.upstream_notebooks.add(notebook["notebook_id"])
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
 
-
-
-
+            if use_system_tables:
+                self._process_system_table_lineage(table, start_time, end_time)
+            else:
+                self._process_table_lineage_via_http_api(table, include_entity_lineage)
         except Exception as e:
             logger.warning(
                 f"Error getting lineage on table {table.ref}: {e}", exc_info=True
             )
 
-    def
-
-
-
-
+    def _process_system_table_lineage(
+        self,
+        table: Table,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        """Process table lineage using system.access.table_lineage table."""
+        catalog_lineage = self.get_catalog_table_lineage_via_system_tables(
+            table.ref.catalog, start_time, end_time
+        )
+        table_full_name = table.ref.qualified_table_name
+
+        lineage_info = catalog_lineage.get(table_full_name, TableLineageInfo())
+
+        # Process table upstreams
+        for upstream in lineage_info.upstreams:
+            upstream_table_name = upstream.table_name
+            # Parse catalog.schema.table format
+            parts = upstream_table_name.split(".")
+            if len(parts) == 3:
+                catalog_name, schema_name, table_name = parts[0], parts[1], parts[2]
+                table_ref = TableReference(
+                    metastore=table.schema.catalog.metastore.id
+                    if table.schema.catalog.metastore
+                    else None,
+                    catalog=catalog_name,
+                    schema=schema_name,
+                    table=table_name,
+                    last_updated=upstream.last_updated,
+                )
+                table.upstreams[table_ref] = {}
+            else:
+                logger.warning(
+                    f"Unexpected upstream table format: {upstream_table_name} for table {table_full_name}"
+                )
+                continue
+
+        # Process external upstreams
+        for external_upstream in lineage_info.external_upstreams:
+            external_ref = ExternalTableReference(
+                path=external_upstream.path,
+                has_permission=True,
+                name=None,
+                type=None,
+                storage_location=external_upstream.path,
+                last_updated=external_upstream.last_updated,
             )
-
+            table.external_upstreams.add(external_ref)
+
+        # Process upstream notebook lineage
+        for notebook_ref in lineage_info.upstream_notebooks:
+            existing_ref = table.upstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        # Process downstream notebook lineage
+        for notebook_ref in lineage_info.downstream_notebooks:
+            existing_ref = table.downstream_notebooks.get(notebook_ref.id)
+            if existing_ref is None or (
+                notebook_ref.last_updated
+                and existing_ref.last_updated
+                and notebook_ref.last_updated > existing_ref.last_updated
+            ):
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def _process_table_lineage_via_http_api(
+        self, table: Table, include_entity_lineage: bool
+    ) -> None:
+        """Process table lineage using the HTTP API (legacy fallback)."""
+        response: dict = self.list_lineages_by_table_via_http_api(
+            table_name=table.ref.qualified_table_name,
+            include_entity_lineage=include_entity_lineage,
+        )
+
+        for item in response.get("upstreams") or []:
+            if "tableInfo" in item:
                 table_ref = TableReference.create_from_lineage(
-                    item, table.schema.catalog.metastore
+                    item["tableInfo"], table.schema.catalog.metastore
                 )
                 if table_ref:
-                    table.upstreams
-
-
+                    table.upstreams[table_ref] = {}
+            elif "fileInfo" in item:
+                external_ref = ExternalTableReference.create_from_lineage(
+                    item["fileInfo"]
+                )
+                if external_ref:
+                    table.external_upstreams.add(external_ref)
+
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.upstream_notebooks[notebook_ref.id] = notebook_ref
+
+        for item in response.get("downstreams") or []:
+            for notebook in item.get("notebookInfos") or []:
+                notebook_ref = NotebookReference(
+                    id=notebook["notebook_id"],
+                )
+                table.downstream_notebooks[notebook_ref.id] = notebook_ref
+
+    def get_column_lineage(
+        self,
+        table: Table,
+        column_names: List[str],
+        *,
+        max_workers: Optional[int] = None,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+    ) -> None:
+        try:
+            # Determine lineage data source based on config
+            use_system_tables = False
+            if self.lineage_data_source == LineageDataSource.SYSTEM_TABLES:
+                use_system_tables = True
+            elif self.lineage_data_source == LineageDataSource.API:
+                use_system_tables = False
+            elif self.lineage_data_source == LineageDataSource.AUTO:
+                # Use the newer system tables if we have a SQL warehouse, otherwise fall back
+                # to the older (and slower) HTTP API.
+                use_system_tables = bool(self.warehouse_id)
+            else:
+                assert_never(self.lineage_data_source)
+
+            if use_system_tables:
+                lineage = (
+                    self.get_catalog_column_lineage_via_system_tables(
+                        table.ref.catalog, start_time, end_time
+                    )
+                    .get(table.ref.schema, {})
+                    .get(table.ref.table, {})
+                )
+            else:
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    futures = [
+                        executor.submit(
+                            self.list_lineages_by_column_via_http_api,
+                            table.ref.qualified_table_name,
+                            column_name,
+                        )
+                        for column_name in column_names
+                    ]
+                    lineage = {
+                        column_name: future.result()
+                        for column_name, future in zip(column_names, futures)
+                    }
+
+            for column_name in column_names:
+                for item in lineage.get(column_name) or []:
+                    table_ref = TableReference.create_from_lineage(
+                        item,
+                        table.schema.catalog.metastore,
+                    )
+                    if table_ref:
+                        table.upstreams.setdefault(table_ref, {}).setdefault(
+                            column_name, []
+                        ).append(item["name"])
+
         except Exception as e:
             logger.warning(
-                f"Error getting column lineage on table {table.ref}
+                f"Error getting column lineage on table {table.ref}: {e}",
                 exc_info=True,
             )
 
@@ -461,6 +992,45 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             if optional_column:
                 yield optional_column
 
+    def _create_ml_model(
+        self, schema: Schema, obj: RegisteredModelInfo
+    ) -> Optional[Model]:
+        if not obj.name or not obj.full_name:
+            self.report.num_ml_models_missing_name += 1
+            return None
+        return Model(
+            id=obj.full_name,
+            name=obj.name,
+            description=obj.comment,
+            schema_name=schema.name,
+            catalog_name=schema.catalog.name,
+            created_at=parse_ts_millis(obj.created_at),
+            updated_at=parse_ts_millis(obj.updated_at),
+        )
+
+    def _create_ml_model_version(
+        self, model: Model, obj: ModelVersionInfo
+    ) -> Optional[ModelVersion]:
+        if obj.version is None:
+            return None
+
+        aliases = []
+        if obj.aliases:
+            for alias in obj.aliases:
+                if alias.alias_name:
+                    aliases.append(alias.alias_name)
+        return ModelVersion(
+            id=f"{model.id}_{obj.version}",
+            name=f"{model.name}_{obj.version}",
+            model=model,
+            version=str(obj.version),
+            aliases=aliases,
+            description=obj.comment,
+            created_at=parse_ts_millis(obj.created_at),
+            updated_at=parse_ts_millis(obj.updated_at),
+            created_by=obj.created_by,
+        )
+
     def _create_service_principal(
         self, obj: DatabricksServicePrincipal
     ) -> Optional[ServicePrincipal]:
@@ -492,3 +1062,176 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             executed_as_user_id=info.executed_as_user_id,
             executed_as_user_name=info.executed_as_user_name,
         )
+
+    def _execute_sql_query(self, query: str, params: Sequence[Any] = ()) -> List[Row]:
+        """Execute SQL query using databricks-sql connector for better performance"""
+        logger.debug(f"Executing SQL query with {len(params)} parameters")
+        if logger.isEnabledFor(logging.DEBUG):
+            # Only log full query in debug mode to avoid performance overhead
+            logger.debug(f"Full SQL query: {query}")
+            if params:
+                logger.debug(f"Query parameters: {params}")
+
+        # Check if warehouse_id is available for SQL operations
+        if not self.warehouse_id:
+            self.report.report_warning(
+                "Cannot execute SQL query",
+                "warehouse_id is not configured. SQL operations require a valid warehouse_id to be set in the Unity Catalog configuration",
+            )
+            logger.warning(
+                "Cannot execute SQL query: warehouse_id is not configured. "
+                "SQL operations require a valid warehouse_id to be set in the Unity Catalog configuration."
+            )
+            return []
+
+        # Log connection parameters (with masked token)
+        masked_params = {**self._sql_connection_params}
+        if "access_token" in masked_params:
+            masked_params["access_token"] = "***MASKED***"
+        logger.debug(f"Using connection parameters: {masked_params}")
+
+        # Log proxy environment variables that affect SQL connections
+        proxy_env_debug = {}
+        for var in ["HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"]:
+            value = os.environ.get(var)
+            if value:
+                proxy_env_debug[var] = mask_proxy_credentials(value)
+
+        if proxy_env_debug:
+            logger.debug(
+                f"SQL connection will use proxy environment variables: {proxy_env_debug}"
+            )
+        else:
+            logger.debug("No proxy environment variables detected for SQL connection")
+
+        try:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
+                cursor.execute(query, list(params))
+                rows = cursor.fetchall()
+                logger.debug(
+                    f"SQL query executed successfully, returned {len(rows)} rows"
+                )
+                return rows
+
+        except Exception as e:
+            logger.warning(f"Failed to execute SQL query: {e}", exc_info=True)
+            if logger.isEnabledFor(logging.DEBUG):
+                # Only log failed query details in debug mode for security
+                logger.debug(f"SQL query that failed: {query}")
+                logger.debug(f"SQL query parameters: {params}")
+
+            # Check if this might be a proxy-related error
+            error_str = str(e).lower()
+            if any(
+                proxy_keyword in error_str
+                for proxy_keyword in [
+                    "proxy",
+                    "407",
+                    "authentication required",
+                    "tunnel",
+                    "connect",
+                ]
+            ):
+                logger.error(
+                    "SQL query failure appears to be proxy-related. "
+                    "Please check proxy configuration and authentication. "
+                    f"Proxy environment variables detected: {list(proxy_env_debug.keys())}"
+                )
+
+            return []
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching schema tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.schema_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, tag_name, tag_value = row
+            schema_key = f"{catalog_name}.{schema_name}"
+
+            if schema_key not in result_dict:
+                result_dict[schema_key] = []
+
+            result_dict[schema_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.catalog_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, tag_name, tag_value = row
+
+            if catalog_name not in result_dict:
+                result_dict[catalog_name] = []
+
+            result_dict[catalog_name].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.table_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, tag_name, tag_value = row
+            table_key = f"{catalog_name}.{schema_name}.{table_name}"
+
+            if table_key not in result_dict:
+                result_dict[table_key] = []
+
+            result_dict[table_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=_MAX_CONCURRENT_CATALOGS))
+    def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching column tags for catalog: `{catalog}`")
+
+        query = f"SELECT * FROM `{catalog}`.information_schema.column_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, column_name, tag_name, tag_value = (
+                row
+            )
+            column_key = f"{catalog_name}.{schema_name}.{table_name}.{column_name}"
+
+            if column_key not in result_dict:
+                result_dict[column_key] = []
+
+            result_dict[column_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict