acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
-
|
|
3
|
+
import time
|
|
4
4
|
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
5
5
|
from urllib.parse import urljoin
|
|
6
6
|
|
|
7
|
+
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
7
8
|
from datahub.emitter.mce_builder import (
|
|
9
|
+
UNKNOWN_USER,
|
|
8
10
|
make_data_platform_urn,
|
|
9
11
|
make_dataplatform_instance_urn,
|
|
10
12
|
make_dataset_urn_with_platform_instance,
|
|
11
13
|
make_domain_urn,
|
|
12
14
|
make_group_urn,
|
|
15
|
+
make_ml_model_group_urn,
|
|
13
16
|
make_schema_field_urn,
|
|
17
|
+
make_ts_millis,
|
|
14
18
|
make_user_urn,
|
|
15
19
|
)
|
|
16
20
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
@@ -23,9 +27,9 @@ from datahub.emitter.mcp_builder import (
|
|
|
23
27
|
UnitySchemaKey,
|
|
24
28
|
UnitySchemaKeyWithMetastore,
|
|
25
29
|
add_dataset_to_container,
|
|
30
|
+
add_entity_to_container,
|
|
26
31
|
gen_containers,
|
|
27
32
|
)
|
|
28
|
-
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
|
|
29
33
|
from datahub.ingestion.api.common import PipelineContext
|
|
30
34
|
from datahub.ingestion.api.decorators import (
|
|
31
35
|
SupportStatus,
|
|
@@ -53,6 +57,7 @@ from datahub.ingestion.source.aws.s3_util import (
|
|
|
53
57
|
from datahub.ingestion.source.common.subtypes import (
|
|
54
58
|
DatasetContainerSubTypes,
|
|
55
59
|
DatasetSubTypes,
|
|
60
|
+
SourceCapabilityModifier,
|
|
56
61
|
)
|
|
57
62
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
58
63
|
StaleEntityRemovalHandler,
|
|
@@ -72,13 +77,19 @@ from datahub.ingestion.source.unity.hive_metastore_proxy import (
|
|
|
72
77
|
HIVE_METASTORE,
|
|
73
78
|
HiveMetastoreProxy,
|
|
74
79
|
)
|
|
80
|
+
from datahub.ingestion.source.unity.platform_resource_repository import (
|
|
81
|
+
UnityCatalogPlatformResourceRepository,
|
|
82
|
+
)
|
|
75
83
|
from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
|
|
76
84
|
from datahub.ingestion.source.unity.proxy_types import (
|
|
77
85
|
DATA_TYPE_REGISTRY,
|
|
78
86
|
Catalog,
|
|
79
87
|
Column,
|
|
80
88
|
CustomCatalogType,
|
|
89
|
+
HiveTableType,
|
|
81
90
|
Metastore,
|
|
91
|
+
Model,
|
|
92
|
+
ModelVersion,
|
|
82
93
|
Notebook,
|
|
83
94
|
NotebookId,
|
|
84
95
|
Schema,
|
|
@@ -87,8 +98,17 @@ from datahub.ingestion.source.unity.proxy_types import (
|
|
|
87
98
|
TableReference,
|
|
88
99
|
)
|
|
89
100
|
from datahub.ingestion.source.unity.report import UnityCatalogReport
|
|
101
|
+
from datahub.ingestion.source.unity.tag_entities import (
|
|
102
|
+
UnityCatalogTagPlatformResource,
|
|
103
|
+
UnityCatalogTagPlatformResourceId,
|
|
104
|
+
)
|
|
90
105
|
from datahub.ingestion.source.unity.usage import UnityCatalogUsageExtractor
|
|
91
|
-
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
|
|
106
|
+
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
107
|
+
GlobalTags,
|
|
108
|
+
MetadataAttribution,
|
|
109
|
+
Siblings,
|
|
110
|
+
TagAssociation,
|
|
111
|
+
)
|
|
92
112
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
93
113
|
DatasetLineageType,
|
|
94
114
|
FineGrainedLineage,
|
|
@@ -98,11 +118,13 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
|
98
118
|
ViewProperties,
|
|
99
119
|
)
|
|
100
120
|
from datahub.metadata.schema_classes import (
|
|
121
|
+
AuditStampClass,
|
|
101
122
|
BrowsePathsClass,
|
|
102
123
|
DataPlatformInstanceClass,
|
|
103
124
|
DatasetLineageTypeClass,
|
|
104
125
|
DatasetPropertiesClass,
|
|
105
126
|
DomainsClass,
|
|
127
|
+
MLModelPropertiesClass,
|
|
106
128
|
MySqlDDLClass,
|
|
107
129
|
NullTypeClass,
|
|
108
130
|
OwnerClass,
|
|
@@ -116,12 +138,10 @@ from datahub.metadata.schema_classes import (
|
|
|
116
138
|
UpstreamClass,
|
|
117
139
|
UpstreamLineageClass,
|
|
118
140
|
)
|
|
141
|
+
from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
|
|
142
|
+
from datahub.sdk import MLModel, MLModelGroup
|
|
119
143
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
120
|
-
from datahub.sql_parsing.
|
|
121
|
-
SqlParsingResult,
|
|
122
|
-
sqlglot_lineage,
|
|
123
|
-
view_definition_lineage_helper,
|
|
124
|
-
)
|
|
144
|
+
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
|
|
125
145
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
126
146
|
from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
|
|
127
147
|
from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
@@ -138,23 +158,32 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
138
158
|
@capability(SourceCapability.USAGE_STATS, "Enabled by default")
|
|
139
159
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
140
160
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
141
|
-
@capability(
|
|
161
|
+
@capability(
|
|
162
|
+
SourceCapability.CONTAINERS,
|
|
163
|
+
"Enabled by default",
|
|
164
|
+
subtype_modifier=[
|
|
165
|
+
SourceCapabilityModifier.CATALOG,
|
|
166
|
+
SourceCapabilityModifier.SCHEMA,
|
|
167
|
+
],
|
|
168
|
+
)
|
|
142
169
|
@capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
|
|
143
170
|
@capability(
|
|
144
171
|
SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
|
|
145
172
|
)
|
|
146
173
|
@capability(
|
|
147
174
|
SourceCapability.DELETION_DETECTION,
|
|
148
|
-
"
|
|
175
|
+
"Enabled by default via stateful ingestion",
|
|
149
176
|
supported=True,
|
|
150
177
|
)
|
|
151
|
-
@
|
|
178
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
179
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
152
180
|
class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
153
181
|
"""
|
|
154
182
|
This plugin extracts the following metadata from Databricks Unity Catalog:
|
|
155
183
|
- metastores
|
|
156
184
|
- schemas
|
|
157
185
|
- tables and column lineage
|
|
186
|
+
- model and model versions
|
|
158
187
|
"""
|
|
159
188
|
|
|
160
189
|
config: UnityCatalogSourceConfig
|
|
@@ -162,6 +191,10 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
162
191
|
platform: str = "databricks"
|
|
163
192
|
platform_instance_name: Optional[str]
|
|
164
193
|
sql_parser_schema_resolver: Optional[SchemaResolver] = None
|
|
194
|
+
platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
|
|
195
|
+
None
|
|
196
|
+
)
|
|
197
|
+
sql_parsing_aggregator: Optional[SqlParsingAggregator] = None
|
|
165
198
|
|
|
166
199
|
def get_report(self) -> UnityCatalogReport:
|
|
167
200
|
return self.report
|
|
@@ -180,6 +213,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
180
213
|
config.warehouse_id,
|
|
181
214
|
report=self.report,
|
|
182
215
|
hive_metastore_proxy=self.hive_metastore_proxy,
|
|
216
|
+
lineage_data_source=config.lineage_data_source,
|
|
217
|
+
usage_data_source=config.usage_data_source,
|
|
218
|
+
databricks_api_page_size=config.databricks_api_page_size,
|
|
183
219
|
)
|
|
184
220
|
|
|
185
221
|
self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
|
|
@@ -205,12 +241,31 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
205
241
|
self.table_refs: Set[TableReference] = set()
|
|
206
242
|
self.view_refs: Set[TableReference] = set()
|
|
207
243
|
self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
|
|
208
|
-
self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
|
|
209
|
-
FileBackedDict()
|
|
210
|
-
)
|
|
211
244
|
|
|
212
245
|
# Global map of tables, for profiling
|
|
213
246
|
self.tables: FileBackedDict[Table] = FileBackedDict()
|
|
247
|
+
if self.ctx.graph:
|
|
248
|
+
self.platform_resource_repository = UnityCatalogPlatformResourceRepository(
|
|
249
|
+
self.ctx.graph, platform_instance=self.platform_instance_name
|
|
250
|
+
)
|
|
251
|
+
else:
|
|
252
|
+
self.platform_resource_repository = None
|
|
253
|
+
|
|
254
|
+
if self.config._forced_disable_tag_extraction:
|
|
255
|
+
self.report.report_warning(
|
|
256
|
+
"Some features disabled because of configuration conflicts",
|
|
257
|
+
"Tag Extraction is disabled due to missing warehouse_id in config",
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
if self.config._forced_disable_hive_metastore_extraction:
|
|
261
|
+
self.report.report_warning(
|
|
262
|
+
"Some features disabled because of configuration conflicts",
|
|
263
|
+
"Hive Metastore Extraction is disabled due to missing warehouse_id in config",
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
# Include platform resource repository in report for automatic cache statistics
|
|
267
|
+
if self.config.include_tags and self.platform_resource_repository:
|
|
268
|
+
self.report.tag_urn_resolver_cache = self.platform_resource_repository
|
|
214
269
|
|
|
215
270
|
def init_hive_metastore_proxy(self):
|
|
216
271
|
self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
|
|
@@ -229,6 +284,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
229
284
|
platform_instance=self.config.platform_instance,
|
|
230
285
|
env=self.config.env,
|
|
231
286
|
)
|
|
287
|
+
self.sql_parsing_aggregator = SqlParsingAggregator(
|
|
288
|
+
platform=self.platform,
|
|
289
|
+
platform_instance=self.config.platform_instance,
|
|
290
|
+
env=self.config.env,
|
|
291
|
+
schema_resolver=self.sql_parser_schema_resolver,
|
|
292
|
+
generate_lineage=True,
|
|
293
|
+
generate_queries=False,
|
|
294
|
+
generate_usage_statistics=False,
|
|
295
|
+
generate_operations=False,
|
|
296
|
+
)
|
|
297
|
+
self.report.sql_aggregator = self.sql_parsing_aggregator.report
|
|
232
298
|
except Exception as e:
|
|
233
299
|
logger.debug("Exception", exc_info=True)
|
|
234
300
|
self.warn(
|
|
@@ -383,12 +449,12 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
383
449
|
self.config.workspace_url, f"#notebook/{notebook.id}"
|
|
384
450
|
),
|
|
385
451
|
created=(
|
|
386
|
-
TimeStampClass(
|
|
452
|
+
TimeStampClass(make_ts_millis(notebook.created_at))
|
|
387
453
|
if notebook.created_at
|
|
388
454
|
else None
|
|
389
455
|
),
|
|
390
456
|
lastModified=(
|
|
391
|
-
TimeStampClass(
|
|
457
|
+
TimeStampClass(make_ts_millis(notebook.modified_at))
|
|
392
458
|
if notebook.modified_at
|
|
393
459
|
else None
|
|
394
460
|
),
|
|
@@ -407,17 +473,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
407
473
|
if not notebook.upstreams:
|
|
408
474
|
return None
|
|
409
475
|
|
|
476
|
+
upstreams = []
|
|
477
|
+
for upstream_ref in notebook.upstreams:
|
|
478
|
+
timestamp = make_ts_millis(upstream_ref.last_updated)
|
|
479
|
+
upstreams.append(
|
|
480
|
+
self._create_upstream_class(
|
|
481
|
+
self.gen_dataset_urn(upstream_ref),
|
|
482
|
+
DatasetLineageTypeClass.COPY,
|
|
483
|
+
timestamp,
|
|
484
|
+
)
|
|
485
|
+
)
|
|
486
|
+
|
|
410
487
|
return MetadataChangeProposalWrapper(
|
|
411
488
|
entityUrn=self.gen_notebook_urn(notebook),
|
|
412
|
-
aspect=UpstreamLineageClass(
|
|
413
|
-
upstreams=[
|
|
414
|
-
UpstreamClass(
|
|
415
|
-
dataset=self.gen_dataset_urn(upstream_ref),
|
|
416
|
-
type=DatasetLineageTypeClass.COPY,
|
|
417
|
-
)
|
|
418
|
-
for upstream_ref in notebook.upstreams
|
|
419
|
-
]
|
|
420
|
-
),
|
|
489
|
+
aspect=UpstreamLineageClass(upstreams=upstreams),
|
|
421
490
|
).as_workunit()
|
|
422
491
|
|
|
423
492
|
def process_metastores(self) -> Iterable[MetadataWorkUnit]:
|
|
@@ -436,14 +505,15 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
436
505
|
self, metastore: Optional[Metastore]
|
|
437
506
|
) -> Iterable[MetadataWorkUnit]:
|
|
438
507
|
for catalog in self._get_catalogs(metastore):
|
|
439
|
-
|
|
440
|
-
self.
|
|
441
|
-
|
|
508
|
+
with self.report.new_stage(f"Ingest catalog {catalog.id}"):
|
|
509
|
+
if not self.config.catalog_pattern.allowed(catalog.id):
|
|
510
|
+
self.report.catalogs.dropped(catalog.id)
|
|
511
|
+
continue
|
|
442
512
|
|
|
443
|
-
|
|
444
|
-
|
|
513
|
+
yield from self.gen_catalog_containers(catalog)
|
|
514
|
+
yield from self.process_schemas(catalog)
|
|
445
515
|
|
|
446
|
-
|
|
516
|
+
self.report.catalogs.processed(catalog.id)
|
|
447
517
|
|
|
448
518
|
def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
|
|
449
519
|
if self.config.catalogs:
|
|
@@ -466,6 +536,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
466
536
|
yield from self.gen_schema_containers(schema)
|
|
467
537
|
try:
|
|
468
538
|
yield from self.process_tables(schema)
|
|
539
|
+
yield from self.process_ml_models(schema)
|
|
469
540
|
except Exception as e:
|
|
470
541
|
logger.exception(f"Error parsing schema {schema}")
|
|
471
542
|
self.report.report_warning(
|
|
@@ -506,13 +577,42 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
506
577
|
yield from self.add_table_to_dataset_container(dataset_urn, schema)
|
|
507
578
|
|
|
508
579
|
table_props = self._create_table_property_aspect(table)
|
|
580
|
+
tags = None
|
|
581
|
+
if not isinstance(table.table_type, HiveTableType) and self.config.include_tags:
|
|
582
|
+
try:
|
|
583
|
+
table_tags = self._get_table_tags(
|
|
584
|
+
table.ref.catalog, table.ref.schema, table.ref.table
|
|
585
|
+
)
|
|
586
|
+
if table_tags:
|
|
587
|
+
logger.debug(f"Table tags for {table.ref}: {table_tags}")
|
|
588
|
+
attribution = MetadataAttribution(
|
|
589
|
+
# source="unity-catalog",
|
|
590
|
+
actor="urn:li:corpuser:datahub",
|
|
591
|
+
time=int(time.time() * 1000),
|
|
592
|
+
)
|
|
593
|
+
tags = GlobalTags(
|
|
594
|
+
tags=[
|
|
595
|
+
TagAssociation(
|
|
596
|
+
tag=tag.to_datahub_tag_urn().urn(),
|
|
597
|
+
attribution=attribution,
|
|
598
|
+
)
|
|
599
|
+
for tag in table_tags
|
|
600
|
+
]
|
|
601
|
+
)
|
|
602
|
+
|
|
603
|
+
yield from self.gen_platform_resources(table_tags)
|
|
604
|
+
|
|
605
|
+
except Exception as e:
|
|
606
|
+
logger.exception(f"Error fetching table {table.ref} tags", exc_info=e)
|
|
509
607
|
|
|
510
608
|
view_props = None
|
|
511
609
|
if table.view_definition:
|
|
512
610
|
view_props = self._create_view_property_aspect(table)
|
|
513
611
|
|
|
514
612
|
sub_type = self._create_table_sub_type_aspect(table)
|
|
515
|
-
schema_metadata = self._create_schema_metadata_aspect(table)
|
|
613
|
+
schema_metadata, platform_resources = self._create_schema_metadata_aspect(table)
|
|
614
|
+
yield from platform_resources
|
|
615
|
+
|
|
516
616
|
domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name)
|
|
517
617
|
ownership = self._create_table_ownership_aspect(table)
|
|
518
618
|
data_platform_instance = self._create_data_platform_instance_aspect()
|
|
@@ -534,8 +634,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
534
634
|
self.sql_parser_schema_resolver.add_schema_metadata(
|
|
535
635
|
dataset_urn, schema_metadata
|
|
536
636
|
)
|
|
537
|
-
if table.view_definition:
|
|
538
|
-
self.
|
|
637
|
+
if table.view_definition and self.sql_parsing_aggregator:
|
|
638
|
+
self.sql_parsing_aggregator.add_view_definition(
|
|
639
|
+
view_urn=dataset_urn,
|
|
640
|
+
view_definition=table.view_definition,
|
|
641
|
+
default_db=table.ref.catalog,
|
|
642
|
+
default_schema=table.ref.schema,
|
|
643
|
+
)
|
|
539
644
|
|
|
540
645
|
if (
|
|
541
646
|
table_props.customProperties.get("table_type")
|
|
@@ -585,29 +690,107 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
585
690
|
domain,
|
|
586
691
|
data_platform_instance,
|
|
587
692
|
lineage,
|
|
693
|
+
tags,
|
|
588
694
|
],
|
|
589
695
|
)
|
|
590
696
|
]
|
|
591
697
|
|
|
698
|
+
def process_ml_models(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
|
|
699
|
+
for ml_model in self.unity_catalog_api_proxy.ml_models(
|
|
700
|
+
schema=schema, max_results=self.config.ml_model_max_results
|
|
701
|
+
):
|
|
702
|
+
yield from self.process_ml_model(ml_model, schema)
|
|
703
|
+
ml_model_urn = self.gen_ml_model_urn(ml_model.id)
|
|
704
|
+
for ml_model_version in self.unity_catalog_api_proxy.ml_model_versions(
|
|
705
|
+
ml_model, include_aliases=self.config.include_ml_model_aliases
|
|
706
|
+
):
|
|
707
|
+
yield from self.process_ml_model_version(
|
|
708
|
+
ml_model_urn, ml_model_version, schema
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
def process_ml_model(
|
|
712
|
+
self, ml_model: Model, schema: Schema
|
|
713
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
714
|
+
ml_model_group = MLModelGroup(
|
|
715
|
+
id=ml_model.id,
|
|
716
|
+
name=ml_model.name,
|
|
717
|
+
platform=self.platform,
|
|
718
|
+
platform_instance=schema.name,
|
|
719
|
+
env=self.config.env,
|
|
720
|
+
description=ml_model.description,
|
|
721
|
+
created=ml_model.created_at,
|
|
722
|
+
last_modified=ml_model.updated_at,
|
|
723
|
+
)
|
|
724
|
+
yield from ml_model_group.as_workunits()
|
|
725
|
+
yield from self.add_model_to_schema_container(str(ml_model_group.urn), schema)
|
|
726
|
+
self.report.ml_models.processed(ml_model.id)
|
|
727
|
+
|
|
728
|
+
def process_ml_model_version(
|
|
729
|
+
self, ml_model_urn: str, ml_model_version: ModelVersion, schema: Schema
|
|
730
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
731
|
+
extra_aspects = []
|
|
732
|
+
if ml_model_version.created_at is not None:
|
|
733
|
+
created_time = int(ml_model_version.created_at.timestamp() * 1000)
|
|
734
|
+
created_actor = (
|
|
735
|
+
f"urn:li:platformResource:{ml_model_version.created_by}"
|
|
736
|
+
if ml_model_version.created_by
|
|
737
|
+
else None
|
|
738
|
+
)
|
|
739
|
+
extra_aspects.append(
|
|
740
|
+
MLModelPropertiesClass(
|
|
741
|
+
created=TimeStampClass(time=created_time, actor=created_actor),
|
|
742
|
+
)
|
|
743
|
+
)
|
|
744
|
+
|
|
745
|
+
ml_model = MLModel(
|
|
746
|
+
id=ml_model_version.id,
|
|
747
|
+
name=ml_model_version.name,
|
|
748
|
+
version=str(ml_model_version.version),
|
|
749
|
+
aliases=ml_model_version.aliases,
|
|
750
|
+
description=ml_model_version.description,
|
|
751
|
+
model_group=ml_model_urn,
|
|
752
|
+
platform=self.platform,
|
|
753
|
+
last_modified=ml_model_version.updated_at,
|
|
754
|
+
extra_aspects=extra_aspects,
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
yield from ml_model.as_workunits()
|
|
758
|
+
yield from self.add_model_version_to_schema_container(str(ml_model.urn), schema)
|
|
759
|
+
self.report.ml_model_versions.processed(ml_model_version.id)
|
|
760
|
+
|
|
592
761
|
def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
|
|
762
|
+
# Calculate datetime filters for lineage
|
|
763
|
+
lineage_start_time = None
|
|
764
|
+
lineage_end_time = self.config.end_time
|
|
765
|
+
|
|
766
|
+
if self.config.ignore_start_time_lineage:
|
|
767
|
+
lineage_start_time = None # Ignore start time to get all lineage
|
|
768
|
+
else:
|
|
769
|
+
lineage_start_time = self.config.start_time
|
|
770
|
+
|
|
593
771
|
if self.config.include_table_lineage:
|
|
594
772
|
self.unity_catalog_api_proxy.table_lineage(
|
|
595
|
-
table,
|
|
773
|
+
table,
|
|
774
|
+
include_entity_lineage=self.config.include_notebooks,
|
|
775
|
+
start_time=lineage_start_time,
|
|
776
|
+
end_time=lineage_end_time,
|
|
596
777
|
)
|
|
597
778
|
|
|
598
779
|
if self.config.include_column_lineage and table.upstreams:
|
|
599
780
|
if len(table.columns) > self.config.column_lineage_column_limit:
|
|
600
781
|
self.report.num_column_lineage_skipped_column_count += 1
|
|
601
782
|
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
783
|
+
column_names = [
|
|
784
|
+
column.name
|
|
785
|
+
for column in table.columns[: self.config.column_lineage_column_limit]
|
|
786
|
+
]
|
|
787
|
+
self.unity_catalog_api_proxy.get_column_lineage(
|
|
788
|
+
table,
|
|
789
|
+
column_names,
|
|
790
|
+
max_workers=self.config.lineage_max_workers,
|
|
791
|
+
start_time=lineage_start_time,
|
|
792
|
+
end_time=lineage_end_time,
|
|
793
|
+
)
|
|
611
794
|
|
|
612
795
|
return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
|
|
613
796
|
|
|
@@ -635,18 +818,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
635
818
|
for d_col, u_cols in sorted(downstream_to_upstream_cols.items())
|
|
636
819
|
)
|
|
637
820
|
|
|
821
|
+
timestamp = make_ts_millis(upstream_ref.last_updated)
|
|
638
822
|
upstreams.append(
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
823
|
+
self._create_upstream_class(
|
|
824
|
+
upstream_urn,
|
|
825
|
+
DatasetLineageTypeClass.TRANSFORMED,
|
|
826
|
+
timestamp,
|
|
642
827
|
)
|
|
643
828
|
)
|
|
644
829
|
|
|
645
|
-
for notebook in table.upstream_notebooks:
|
|
830
|
+
for notebook in table.upstream_notebooks.values():
|
|
831
|
+
timestamp = make_ts_millis(notebook.last_updated)
|
|
646
832
|
upstreams.append(
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
833
|
+
self._create_upstream_class(
|
|
834
|
+
self.gen_notebook_urn(notebook.id),
|
|
835
|
+
DatasetLineageTypeClass.TRANSFORMED,
|
|
836
|
+
timestamp,
|
|
650
837
|
)
|
|
651
838
|
)
|
|
652
839
|
|
|
@@ -708,6 +895,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
708
895
|
env=self.config.env,
|
|
709
896
|
)
|
|
710
897
|
|
|
898
|
+
def gen_ml_model_urn(self, name: str) -> str:
|
|
899
|
+
return make_ml_model_group_urn(
|
|
900
|
+
platform=self.platform,
|
|
901
|
+
group_name=name,
|
|
902
|
+
env=self.config.env,
|
|
903
|
+
)
|
|
904
|
+
|
|
711
905
|
def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str:
|
|
712
906
|
notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook
|
|
713
907
|
return NotebookKey(
|
|
@@ -716,8 +910,41 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
716
910
|
instance=self.config.platform_instance,
|
|
717
911
|
).as_urn()
|
|
718
912
|
|
|
913
|
+
def _create_upstream_class(
|
|
914
|
+
self,
|
|
915
|
+
dataset_urn: str,
|
|
916
|
+
lineage_type: Union[str, DatasetLineageTypeClass],
|
|
917
|
+
timestamp: Optional[int],
|
|
918
|
+
) -> UpstreamClass:
|
|
919
|
+
"""
|
|
920
|
+
Helper method to create UpstreamClass with optional audit stamp.
|
|
921
|
+
If timestamp is None, audit stamp is omitted.
|
|
922
|
+
"""
|
|
923
|
+
if timestamp is not None:
|
|
924
|
+
return UpstreamClass(
|
|
925
|
+
dataset=dataset_urn,
|
|
926
|
+
type=lineage_type,
|
|
927
|
+
auditStamp=AuditStampClass(
|
|
928
|
+
time=timestamp,
|
|
929
|
+
actor=UNKNOWN_USER,
|
|
930
|
+
),
|
|
931
|
+
)
|
|
932
|
+
else:
|
|
933
|
+
return UpstreamClass(
|
|
934
|
+
dataset=dataset_urn,
|
|
935
|
+
type=lineage_type,
|
|
936
|
+
)
|
|
937
|
+
|
|
719
938
|
def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
|
|
720
939
|
domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
|
|
940
|
+
schema_tags = []
|
|
941
|
+
if self.config.include_tags:
|
|
942
|
+
schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
|
|
943
|
+
schema.catalog.name
|
|
944
|
+
).get(f"{schema.catalog.name}.{schema.name}", [])
|
|
945
|
+
logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
|
|
946
|
+
# Generate platform resources for schema tags
|
|
947
|
+
yield from self.gen_platform_resources(schema_tags)
|
|
721
948
|
|
|
722
949
|
schema_container_key = self.gen_schema_key(schema)
|
|
723
950
|
yield from gen_containers(
|
|
@@ -729,6 +956,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
729
956
|
description=schema.comment,
|
|
730
957
|
owner_urn=self.get_owner_urn(schema.owner),
|
|
731
958
|
external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
|
|
959
|
+
tags=[tag.to_datahub_tag_urn().name for tag in schema_tags]
|
|
960
|
+
if schema_tags
|
|
961
|
+
else None,
|
|
732
962
|
)
|
|
733
963
|
|
|
734
964
|
def gen_metastore_containers(
|
|
@@ -749,6 +979,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
749
979
|
|
|
750
980
|
def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
|
|
751
981
|
domain_urn = self._gen_domain_urn(catalog.name)
|
|
982
|
+
catalog_tags = []
|
|
983
|
+
if self.config.include_tags:
|
|
984
|
+
catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
|
|
985
|
+
catalog.name
|
|
986
|
+
).get(catalog.name, [])
|
|
987
|
+
logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
|
|
988
|
+
# Generate platform resources for schema tags
|
|
989
|
+
yield from self.gen_platform_resources(catalog_tags)
|
|
752
990
|
|
|
753
991
|
catalog_container_key = self.gen_catalog_key(catalog)
|
|
754
992
|
yield from gen_containers(
|
|
@@ -764,6 +1002,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
764
1002
|
description=catalog.comment,
|
|
765
1003
|
owner_urn=self.get_owner_urn(catalog.owner),
|
|
766
1004
|
external_url=f"{self.external_url_base}/{catalog.name}",
|
|
1005
|
+
tags=[tag.to_datahub_tag_urn().name for tag in catalog_tags]
|
|
1006
|
+
if catalog_tags
|
|
1007
|
+
else None,
|
|
767
1008
|
)
|
|
768
1009
|
|
|
769
1010
|
def gen_schema_key(self, schema: Schema) -> ContainerKey:
|
|
@@ -832,6 +1073,50 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
832
1073
|
dataset_urn=dataset_urn,
|
|
833
1074
|
)
|
|
834
1075
|
|
|
1076
|
+
def add_model_to_schema_container(
|
|
1077
|
+
self, model_urn: str, schema: Schema
|
|
1078
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1079
|
+
schema_container_key = self.gen_schema_key(schema)
|
|
1080
|
+
yield from add_entity_to_container(
|
|
1081
|
+
container_key=schema_container_key,
|
|
1082
|
+
entity_type=MlModelGroupUrn.ENTITY_TYPE,
|
|
1083
|
+
entity_urn=model_urn,
|
|
1084
|
+
)
|
|
1085
|
+
|
|
1086
|
+
def add_model_version_to_schema_container(
|
|
1087
|
+
self, model_version_urn: str, schema: Schema
|
|
1088
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1089
|
+
schema_container_key = self.gen_schema_key(schema)
|
|
1090
|
+
yield from add_entity_to_container(
|
|
1091
|
+
container_key=schema_container_key,
|
|
1092
|
+
entity_type=MlModelUrn.ENTITY_TYPE,
|
|
1093
|
+
entity_urn=model_version_urn,
|
|
1094
|
+
)
|
|
1095
|
+
|
|
1096
|
+
def _get_catalog_tags(
|
|
1097
|
+
self, catalog: str, schema: str, table: str
|
|
1098
|
+
) -> List[UnityCatalogTag]:
|
|
1099
|
+
all_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog)
|
|
1100
|
+
return all_tags.get(f"{catalog}", [])
|
|
1101
|
+
|
|
1102
|
+
def _get_schema_tags(
|
|
1103
|
+
self, catalog: str, schema: str, table: str
|
|
1104
|
+
) -> List[UnityCatalogTag]:
|
|
1105
|
+
all_tags = self.unity_catalog_api_proxy.get_schema_tags(catalog)
|
|
1106
|
+
return all_tags.get(f"{catalog}.{schema}", [])
|
|
1107
|
+
|
|
1108
|
+
def _get_table_tags(
|
|
1109
|
+
self, catalog: str, schema: str, table: str
|
|
1110
|
+
) -> List[UnityCatalogTag]:
|
|
1111
|
+
all_tags = self.unity_catalog_api_proxy.get_table_tags(catalog)
|
|
1112
|
+
return all_tags.get(f"{catalog}.{schema}.{table}", [])
|
|
1113
|
+
|
|
1114
|
+
def _get_column_tags(
|
|
1115
|
+
self, catalog: str, schema: str, table: str, column: str
|
|
1116
|
+
) -> List[UnityCatalogTag]:
|
|
1117
|
+
all_tags = self.unity_catalog_api_proxy.get_column_tags(catalog)
|
|
1118
|
+
return all_tags.get(f"{catalog}.{schema}.{table}.{column}", [])
|
|
1119
|
+
|
|
835
1120
|
def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
|
|
836
1121
|
custom_properties: dict = {}
|
|
837
1122
|
if table.storage_location is not None:
|
|
@@ -860,16 +1145,20 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
860
1145
|
created: Optional[TimeStampClass] = None
|
|
861
1146
|
if table.created_at:
|
|
862
1147
|
custom_properties["created_at"] = str(table.created_at)
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
1148
|
+
created_ts = make_ts_millis(table.created_at)
|
|
1149
|
+
if created_ts is not None:
|
|
1150
|
+
created = TimeStampClass(
|
|
1151
|
+
created_ts,
|
|
1152
|
+
make_user_urn(table.created_by) if table.created_by else None,
|
|
1153
|
+
)
|
|
867
1154
|
last_modified = created
|
|
868
1155
|
if table.updated_at:
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
1156
|
+
updated_ts = make_ts_millis(table.updated_at)
|
|
1157
|
+
if updated_ts is not None:
|
|
1158
|
+
last_modified = TimeStampClass(
|
|
1159
|
+
updated_ts,
|
|
1160
|
+
table.updated_by and make_user_urn(table.updated_by),
|
|
1161
|
+
)
|
|
873
1162
|
|
|
874
1163
|
return DatasetPropertiesClass(
|
|
875
1164
|
name=table.name,
|
|
@@ -921,30 +1210,127 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
921
1210
|
materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
|
|
922
1211
|
)
|
|
923
1212
|
|
|
924
|
-
def
|
|
925
|
-
|
|
1213
|
+
def get_or_create_from_unity_tag(
|
|
1214
|
+
self,
|
|
1215
|
+
unity_tag: UnityCatalogTag,
|
|
1216
|
+
platform_instance: Optional[str],
|
|
1217
|
+
managed_by_datahub: bool = False,
|
|
1218
|
+
) -> UnityCatalogTagPlatformResource:
|
|
1219
|
+
"""
|
|
1220
|
+
Optimized helper to get or create a Unity Catalog tag platform resource.
|
|
1221
|
+
This eliminates the duplicate search by skipping the from_tag method which was
|
|
1222
|
+
doing a redundant search before get_from_datahub.
|
|
1223
|
+
"""
|
|
1224
|
+
# Create the platform resource ID directly without the from_tag search
|
|
1225
|
+
platform_resource_id = UnityCatalogTagPlatformResourceId(
|
|
1226
|
+
tag_key=unity_tag.key.raw_text,
|
|
1227
|
+
tag_value=unity_tag.value.raw_text if unity_tag.value is not None else None,
|
|
1228
|
+
platform_instance=platform_instance,
|
|
1229
|
+
exists_in_unity_catalog=True, # We got it from Unity Catalog
|
|
1230
|
+
persisted=False,
|
|
1231
|
+
)
|
|
1232
|
+
|
|
1233
|
+
# Use the repository's get_entity_from_datahub method which handles
|
|
1234
|
+
# searching and caching internally - this is the ONLY search we need
|
|
1235
|
+
if self.platform_resource_repository is None:
|
|
1236
|
+
raise ValueError("Platform resource repository not initialized")
|
|
1237
|
+
return self.platform_resource_repository.get_entity_from_datahub(
|
|
1238
|
+
platform_resource_id,
|
|
1239
|
+
managed_by_datahub,
|
|
1240
|
+
)
|
|
1241
|
+
|
|
1242
|
+
def gen_platform_resources(
|
|
1243
|
+
self, tags: List[UnityCatalogTag]
|
|
1244
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1245
|
+
if self.ctx.graph and self.platform_resource_repository:
|
|
1246
|
+
for tag in tags:
|
|
1247
|
+
try:
|
|
1248
|
+
# Use optimized helper method that combines ID creation and entity retrieval
|
|
1249
|
+
unity_catalog_tag = self.get_or_create_from_unity_tag(
|
|
1250
|
+
tag,
|
|
1251
|
+
self.platform_instance_name,
|
|
1252
|
+
managed_by_datahub=False,
|
|
1253
|
+
)
|
|
1254
|
+
logger.debug(
|
|
1255
|
+
f"Retrieved/created platform resource for tag {tag.key.raw_text}"
|
|
1256
|
+
)
|
|
1257
|
+
if (
|
|
1258
|
+
tag.to_datahub_tag_urn().urn()
|
|
1259
|
+
not in unity_catalog_tag.datahub_linked_resources().urns
|
|
1260
|
+
):
|
|
1261
|
+
unity_catalog_tag.datahub_linked_resources().add(
|
|
1262
|
+
tag.to_datahub_tag_urn().urn()
|
|
1263
|
+
)
|
|
1264
|
+
platform_resource = unity_catalog_tag.as_platform_resource()
|
|
1265
|
+
for mcp in platform_resource.to_mcps():
|
|
1266
|
+
yield MetadataWorkUnit(
|
|
1267
|
+
id=f"platform_resource-{platform_resource.id}",
|
|
1268
|
+
mcp=mcp,
|
|
1269
|
+
)
|
|
1270
|
+
except Exception as e:
|
|
1271
|
+
logger.exception(
|
|
1272
|
+
f"Error processing platform resource for tag {tag}"
|
|
1273
|
+
)
|
|
1274
|
+
self.report.report_warning(
|
|
1275
|
+
message="Error processing platform resource for tag",
|
|
1276
|
+
context=str(tag),
|
|
1277
|
+
title="Error processing platform resource for tag",
|
|
1278
|
+
exc=e,
|
|
1279
|
+
)
|
|
1280
|
+
continue
|
|
926
1281
|
|
|
1282
|
+
def _create_schema_metadata_aspect(
|
|
1283
|
+
self, table: Table
|
|
1284
|
+
) -> Tuple[SchemaMetadataClass, Iterable[MetadataWorkUnit]]:
|
|
1285
|
+
schema_fields: List[SchemaFieldClass] = []
|
|
1286
|
+
unique_tags: Set[UnityCatalogTag] = set()
|
|
927
1287
|
for column in table.columns:
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
1288
|
+
tag_urns: Optional[List[TagUrn]] = None
|
|
1289
|
+
if self.config.include_tags:
|
|
1290
|
+
column_tags = self._get_column_tags(
|
|
1291
|
+
table.ref.catalog, table.ref.schema, table.ref.table, column.name
|
|
1292
|
+
)
|
|
1293
|
+
unique_tags.update(column_tags)
|
|
1294
|
+
tag_urns = [tag.to_datahub_tag_urn() for tag in column_tags]
|
|
1295
|
+
schema_fields.extend(self._create_schema_field(column, tag_urns))
|
|
1296
|
+
|
|
1297
|
+
platform_resources = self.gen_platform_resources(list(unique_tags))
|
|
1298
|
+
return (
|
|
1299
|
+
SchemaMetadataClass(
|
|
1300
|
+
schemaName=table.id,
|
|
1301
|
+
platform=make_data_platform_urn(self.platform),
|
|
1302
|
+
fields=schema_fields,
|
|
1303
|
+
hash="",
|
|
1304
|
+
version=0,
|
|
1305
|
+
platformSchema=MySqlDDLClass(tableSchema=""),
|
|
1306
|
+
),
|
|
1307
|
+
platform_resources,
|
|
937
1308
|
)
|
|
938
1309
|
|
|
939
1310
|
@staticmethod
|
|
940
|
-
def _create_schema_field(
|
|
1311
|
+
def _create_schema_field(
|
|
1312
|
+
column: Column, tags: Optional[List[TagUrn]]
|
|
1313
|
+
) -> List[SchemaFieldClass]:
|
|
941
1314
|
_COMPLEX_TYPE = re.compile("^(struct|array)")
|
|
942
|
-
|
|
1315
|
+
global_tags: Optional[GlobalTags] = None
|
|
943
1316
|
if _COMPLEX_TYPE.match(column.type_text.lower()):
|
|
944
1317
|
return get_schema_fields_for_hive_column(
|
|
945
1318
|
column.name, column.type_text.lower(), description=column.comment
|
|
946
1319
|
)
|
|
947
1320
|
else:
|
|
1321
|
+
if tags is not None:
|
|
1322
|
+
attribution = MetadataAttribution(
|
|
1323
|
+
source="urn:li:dataPlatform:unity-catalog",
|
|
1324
|
+
actor="urn:li:corpuser:datahub",
|
|
1325
|
+
time=int(time.time() * 1000),
|
|
1326
|
+
)
|
|
1327
|
+
global_tags = GlobalTags(
|
|
1328
|
+
tags=[
|
|
1329
|
+
TagAssociation(tag=tag.urn(), attribution=attribution)
|
|
1330
|
+
for tag in tags
|
|
1331
|
+
]
|
|
1332
|
+
)
|
|
1333
|
+
|
|
948
1334
|
return [
|
|
949
1335
|
SchemaFieldClass(
|
|
950
1336
|
fieldPath=column.name,
|
|
@@ -954,78 +1340,26 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
954
1340
|
nativeDataType=column.type_text,
|
|
955
1341
|
nullable=column.nullable,
|
|
956
1342
|
description=column.comment,
|
|
1343
|
+
globalTags=global_tags if tags else None,
|
|
957
1344
|
)
|
|
958
1345
|
]
|
|
959
1346
|
|
|
960
|
-
def _run_sql_parser(
|
|
961
|
-
self, view_ref: TableReference, query: str, schema_resolver: SchemaResolver
|
|
962
|
-
) -> Optional[SqlParsingResult]:
|
|
963
|
-
raw_lineage = sqlglot_lineage(
|
|
964
|
-
query,
|
|
965
|
-
schema_resolver=schema_resolver,
|
|
966
|
-
default_db=view_ref.catalog,
|
|
967
|
-
default_schema=view_ref.schema,
|
|
968
|
-
)
|
|
969
|
-
view_urn = self.gen_dataset_urn(view_ref)
|
|
970
|
-
|
|
971
|
-
if raw_lineage.debug_info.table_error:
|
|
972
|
-
logger.debug(
|
|
973
|
-
f"Failed to parse lineage for view {view_ref}: "
|
|
974
|
-
f"{raw_lineage.debug_info.table_error}"
|
|
975
|
-
)
|
|
976
|
-
self.report.num_view_definitions_failed_parsing += 1
|
|
977
|
-
self.report.view_definitions_parsing_failures.append(
|
|
978
|
-
f"Table-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.table_error}"
|
|
979
|
-
)
|
|
980
|
-
return None
|
|
981
|
-
|
|
982
|
-
elif raw_lineage.debug_info.column_error:
|
|
983
|
-
self.report.num_view_definitions_failed_column_parsing += 1
|
|
984
|
-
self.report.view_definitions_parsing_failures.append(
|
|
985
|
-
f"Column-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.column_error}"
|
|
986
|
-
)
|
|
987
|
-
else:
|
|
988
|
-
self.report.num_view_definitions_parsed += 1
|
|
989
|
-
if raw_lineage.out_tables != [view_urn]:
|
|
990
|
-
self.report.num_view_definitions_view_urn_mismatch += 1
|
|
991
|
-
return view_definition_lineage_helper(raw_lineage, view_urn)
|
|
992
|
-
|
|
993
1347
|
def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
|
|
994
1348
|
if not (
|
|
995
1349
|
self.config.include_hive_metastore
|
|
996
1350
|
and self.config.include_table_lineage
|
|
997
|
-
and self.
|
|
1351
|
+
and self.sql_parsing_aggregator
|
|
998
1352
|
):
|
|
999
1353
|
return
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
generate_usage_statistics=False,
|
|
1004
|
-
generate_operations=False,
|
|
1005
|
-
)
|
|
1006
|
-
for dataset_name in self.view_definitions.keys():
|
|
1007
|
-
view_ref, view_definition = self.view_definitions[dataset_name]
|
|
1008
|
-
result = self._run_sql_parser(
|
|
1009
|
-
view_ref,
|
|
1010
|
-
view_definition,
|
|
1011
|
-
self.sql_parser_schema_resolver,
|
|
1012
|
-
)
|
|
1013
|
-
if result and result.out_tables:
|
|
1014
|
-
# This does not yield any workunits but we use
|
|
1015
|
-
# yield here to execute this method
|
|
1016
|
-
yield from builder.process_sql_parsing_result(
|
|
1017
|
-
result=result,
|
|
1018
|
-
query=view_definition,
|
|
1019
|
-
is_view_ddl=True,
|
|
1020
|
-
include_column_lineage=self.config.include_view_column_lineage,
|
|
1021
|
-
)
|
|
1022
|
-
yield from builder.gen_workunits()
|
|
1354
|
+
|
|
1355
|
+
for mcp in self.sql_parsing_aggregator.gen_metadata():
|
|
1356
|
+
yield mcp.as_workunit()
|
|
1023
1357
|
|
|
1024
1358
|
def close(self):
|
|
1025
1359
|
if self.hive_metastore_proxy:
|
|
1026
1360
|
self.hive_metastore_proxy.close()
|
|
1027
|
-
if self.
|
|
1028
|
-
self.
|
|
1361
|
+
if self.sql_parsing_aggregator:
|
|
1362
|
+
self.sql_parsing_aggregator.close()
|
|
1029
1363
|
if self.sql_parser_schema_resolver:
|
|
1030
1364
|
self.sql_parser_schema_resolver.close()
|
|
1031
1365
|
|