acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,724 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import threading
|
|
3
|
+
import typing
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import (
|
|
8
|
+
Generic,
|
|
9
|
+
Iterable,
|
|
10
|
+
List,
|
|
11
|
+
Optional,
|
|
12
|
+
Protocol,
|
|
13
|
+
Type,
|
|
14
|
+
TypeVar,
|
|
15
|
+
Union,
|
|
16
|
+
cast,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
import cachetools
|
|
20
|
+
from pydantic import BaseModel
|
|
21
|
+
from typing_extensions import get_original_bases
|
|
22
|
+
|
|
23
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
24
|
+
PlatformResource,
|
|
25
|
+
PlatformResourceKey,
|
|
26
|
+
PlatformResourceSearchFields,
|
|
27
|
+
)
|
|
28
|
+
from datahub.ingestion.api.report import SupportsAsObj
|
|
29
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
30
|
+
from datahub.metadata.urns import PlatformResourceUrn, Urn
|
|
31
|
+
from datahub.utilities.search_utils import ElasticDocumentQuery
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
# Type variables for generic repository
|
|
36
|
+
TExternalEntityId = TypeVar("TExternalEntityId", bound="ExternalEntityId")
|
|
37
|
+
TExternalEntity = TypeVar("TExternalEntity", bound="ExternalEntity")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class UrnCacheKey:
    """Immutable, hashable key for the URN search cache.

    Pairs a URN string with the (optional) platform instance it belongs to,
    so cache code can compare the two fields directly instead of parsing a
    concatenated string.
    """

    urn: str
    platform_instance: Optional[str]

    def __str__(self) -> str:
        """Render the key for log and debug output."""
        return "UrnCacheKey(urn={}, platform_instance={})".format(
            self.urn, self.platform_instance
        )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class SyncContext(Protocol):
    """Structural interface for platform-specific sync context objects.

    An object satisfies this protocol when it exposes a ``platform_instance``
    attribute holding either a string or ``None``.
    """

    platform_instance: Optional[str]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class PlatformResourceRepository(
|
|
68
|
+
SupportsAsObj, ABC, Generic[TExternalEntityId, TExternalEntity]
|
|
69
|
+
):
|
|
70
|
+
CACHE_SIZE = 1000
|
|
71
|
+
|
|
72
|
+
# Subclasses should override this with their specific entity class
|
|
73
|
+
entity_class: Type[TExternalEntity]
|
|
74
|
+
|
|
75
|
+
def __init__(self, graph: DataHubGraph, platform_instance: Optional[str] = None):
    """Bind the repository to a graph client and start with empty caches.

    Args:
        graph: DataHub graph client stored on the instance as ``graph``.
        platform_instance: Optional platform instance stored on the
            instance as ``platform_instance``.
    """
    self.graph = graph
    self.platform_instance = platform_instance

    # The entity class is read from the generic parameters of the concrete
    # class: the second type argument of its first declared base.
    first_declared_base = get_original_bases(self.__class__)[0]
    type_arguments = typing.get_args(first_declared_base)
    self.entity_class = type_arguments[1]

    # Both caches are LRU-bounded by the capacity declared on the base class.
    cache_capacity = PlatformResourceRepository.CACHE_SIZE

    # UrnCacheKey -> external entity id (None is an allowed cached value).
    self.urn_search_cache: cachetools.LRUCache[
        UrnCacheKey, Optional[TExternalEntityId]
    ] = cachetools.LRUCache(maxsize=cache_capacity)

    # platform_resource_key.id -> external entity.
    self.external_entity_cache: cachetools.LRUCache[str, TExternalEntity] = (
        cachetools.LRUCache(maxsize=cache_capacity)
    )

    # Hit/miss counters, kept as plain integers.
    self.urn_search_cache_hits = 0
    self.urn_search_cache_misses = 0
    self.external_entity_cache_hits = 0
    self.external_entity_cache_misses = 0

    # Failure counters for cache maintenance.
    self.cache_update_errors = 0
    self.cache_invalidation_errors = 0
    self.entity_creation_errors = 0
    self.cache_key_parsing_errors = 0

    # Re-entrant lock, so a thread already holding it can acquire it again.
    self._cache_lock = threading.RLock()
|
|
108
|
+
|
|
109
|
+
def search_by_filter(
    self, query: ElasticDocumentQuery, add_to_cache: bool = True
) -> Iterable[PlatformResource]:
    """Lazily yield every platform resource that matches ``query``.

    Args:
        query: Search query passed to ``PlatformResource.search_by_filters``
            together with this repository's graph client.
        add_to_cache: Accepted for API compatibility only; the value is
            ignored because raw ``PlatformResource`` objects are not cached.
    """
    yield from PlatformResource.search_by_filters(self.graph, query)
|
|
117
|
+
|
|
118
|
+
def create(self, platform_resource: PlatformResource) -> None:
    """Write ``platform_resource`` to DataHub, then refresh both caches.

    The DataHub write runs first, outside the cache lock; an exception
    raised by it propagates to the caller. The cache refresh then runs
    while holding ``self._cache_lock``:

    * the entity is rebuilt from the resource's serialized value, its id is
      re-created with ``persisted=True``, and the result is stored in
      ``external_entity_cache`` under the id's platform resource key;
    * every secondary key of the resource is mapped to that id in
      ``urn_search_cache``.

    A resource without ``resource_info`` or without a value is written to
    DataHub but leaves the caches untouched, as does an entity whose id has
    no ``dict`` attribute. Any exception raised during the cache refresh is
    counted in ``cache_update_errors`` and logged; it is not re-raised.

    Args:
        platform_resource: The resource to persist and cache.
    """
    # Network write first, so the lock is never held across I/O.
    platform_resource.to_datahub(self.graph)

    with self._cache_lock:
        resource_info = platform_resource.resource_info
        if not (resource_info and resource_info.value):
            return

        try:
            # Recover the original entity from the serialized resource value.
            entity_obj = resource_info.value.as_pydantic_object(self.entity_class)
            entity = self.entity_class(**entity_obj.dict())

            entity_id = entity.get_id()
            if not hasattr(entity_id, "dict"):
                # Without dict() the id cannot be rebuilt with new flags.
                return

            # Rebuild the id, flagged as persisted now that the write is done.
            entity_id_data = entity_id.dict()
            entity_id_data["persisted"] = True
            updated_entity_id = type(entity_id)(**entity_id_data)

            # Rebuild the entity around the new id instead of mutating it.
            entity_data = entity.dict()  # type: ignore[attr-defined]
            entity_data["id"] = updated_entity_id
            updated_entity = type(entity)(**entity_data)

            # Keyed by the platform resource key id of the updated entity id.
            updated_platform_resource_key = (
                updated_entity_id.to_platform_resource_key()
            )
            self.external_entity_cache[updated_platform_resource_key.id] = (
                updated_entity
            )

            # Point every secondary key at the new id. Assigning by key also
            # overwrites any None previously cached for the same
            # (urn, platform_instance) pair, so no separate pass over the
            # whole cache is needed to clear stale "not found" entries.
            for secondary_key in resource_info.secondary_keys or ():
                urn_cache_key = UrnCacheKey(
                    urn=secondary_key,
                    platform_instance=self.platform_instance,
                )
                self.urn_search_cache[urn_cache_key] = cast(
                    TExternalEntityId, updated_entity_id
                )

        except Exception as cache_error:
            # Best effort: the resource is already in DataHub, so a cache
            # failure is recorded rather than raised.
            self.cache_update_errors += 1
            logger.error(
                f"Failed to update caches after entity creation for resource {platform_resource.id}: {cache_error}"
            )
|
|
233
|
+
|
|
234
|
+
def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
    """Fetch a platform resource with a direct DataHub search (no caching).

    The search filters on the repository's resource type
    (``get_resource_type()``) and on ``key.primary_key``, and is issued with
    ``add_to_cache=False``.

    Args:
        key: Key of the platform resource to look up.

    Returns:
        The first search hit whose ``id`` equals ``key.id``, or ``None`` when
        no hit has that id.
    """
    lookup_query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.PRIMARY_KEY, key.primary_key),
    )
    # Consume the whole search result before filtering by id.
    candidates = list(self.search_by_filter(lookup_query, add_to_cache=False))
    return next(
        (candidate for candidate in candidates if candidate.id == key.id),
        None,
    )
|
|
259
|
+
|
|
260
|
+
def delete(self, key: PlatformResourceKey) -> None:
    """Hard-delete a platform resource in DataHub and evict its cached entity.

    The DataHub deletion is issued first, without holding the cache lock.
    Afterwards the entry stored under ``key.id`` is dropped from
    ``external_entity_cache`` while ``_cache_lock`` is held.

    Args:
        key: Key of the platform resource to delete.
    """
    resource_urn = PlatformResourceUrn(key.id).urn()
    self.graph.delete_entity(urn=resource_urn, hard=True)

    with self._cache_lock:
        # Dropping a key that is not cached is a no-op.
        self.external_entity_cache.pop(key.id, None)

    # URN search cache entries are deliberately left alone: finding the ones
    # that refer to this resource would mean an O(n) scan over every cache key.
    # NOTE(review): the original author expects such entries to age out through
    # the cache's own eviction - confirm against the cache configuration.
|
|
276
|
+
|
|
277
|
+
def get_resource_type(self) -> str:
    """Return the resource type string used to filter platform resources.

    Returns:
        The ``__name__`` of ``self.entity_class``
        (e.g. 'UnityCatalogTagPlatformResource').
    """
    entity_cls = self.entity_class
    return entity_cls.__name__
|
|
286
|
+
|
|
287
|
+
def create_default_entity(
    self, entity_id: TExternalEntityId, managed_by_datahub: bool
) -> TExternalEntity:
    """Build the fallback entity used when nothing was found in DataHub.

    Construction is delegated to ``self.entity_class.create_default`` so the
    creation logic lives with the entity class itself.

    Args:
        entity_id: The external entity ID.
        managed_by_datahub: Whether the entity is managed by DataHub.

    Returns:
        The default entity instance produced by the entity class.
    """
    default_entity = self.entity_class.create_default(entity_id, managed_by_datahub)
    # The cast only narrows the static type; it performs no runtime conversion.
    return cast(TExternalEntity, default_entity)
|
|
308
|
+
|
|
309
|
+
def search_entity_by_urn(self, urn: str) -> Optional[TExternalEntityId]:
    """Look up the external entity ID linked to a URN, using the URN search cache.

    The cache key combines ``urn`` with ``self.platform_instance``. On a miss,
    platform resources of this repository's resource type whose secondary keys
    contain ``urn`` are searched, and the first one whose entity ID has the
    same platform instance as the repository wins. The outcome is cached even
    when it is ``None``.

    Args:
        urn: The URN to search for.

    Returns:
        A copy of the matching entity ID with ``persisted`` set to ``True``,
        or ``None`` when no resource matches.
    """
    cache_key = UrnCacheKey(urn=urn, platform_instance=self.platform_instance)

    # Cache lookup and hit/miss counters are updated under the cache lock.
    with self._cache_lock:
        if cache_key in self.urn_search_cache:
            hit = self.urn_search_cache[cache_key]
            self.urn_search_cache_hits += 1
            logger.debug(f"Cache hit for URN search: {cache_key}")
            return hit

        self.urn_search_cache_misses += 1

    logger.debug(
        f"Cache miss for URN {urn} with platform instance {self.platform_instance}"
    )

    secondary_key_query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
    )
    # add_to_cache=False: this method stores the result in its own cache below.
    candidates = list(
        self.search_by_filter(secondary_key_query, add_to_cache=False)
    )

    found = None
    for candidate in candidates:
        info = candidate.resource_info
        if not (info and info.value):
            # Nothing to deserialize for this resource.
            continue

        parsed = info.value.as_pydantic_object(self.entity_class)
        candidate_id = self.entity_class(**parsed.dict()).get_id()
        if candidate_id.platform_instance == self.platform_instance:
            # Build a fresh ID marked as persisted instead of mutating the
            # deserialized one.
            id_fields = candidate_id.dict()
            id_fields["persisted"] = True
            found = cast(TExternalEntityId, type(candidate_id)(**id_fields))
            break

    # Store the outcome (possibly None) under the cache lock.
    with self._cache_lock:
        self.urn_search_cache[cache_key] = found
    return found
|
|
379
|
+
|
|
380
|
+
def get_entity_from_datahub(
    self, entity_id: TExternalEntityId, managed_by_datahub: bool = False
) -> TExternalEntity:
    """Return the external entity for an ID, consulting the entity cache first.

    On a cache miss, platform resources of this repository's resource type
    with the ID's primary key are searched. A single hit is used as-is; with
    several hits, the first whose entity ID has the same platform instance as
    ``entity_id`` is used. When nothing usable is found, a default entity is
    created via ``create_default_entity``. The result is stored in
    ``external_entity_cache`` under the platform resource key's ``id``.

    Args:
        entity_id: The external entity ID to retrieve.
        managed_by_datahub: Passed to ``create_default_entity`` when a default
            entity has to be created.

    Returns:
        The entity found in DataHub, or a newly created default entity.

    Raises:
        Exception: Whatever ``create_default_entity`` raises; the error is
            counted and logged before being re-raised.
    """

    def _to_entity(resource):
        # Deserialize a platform resource into an entity; None when the
        # resource carries no usable value.
        info = resource.resource_info
        if not (info and info.value):
            return None
        parsed = info.value.as_pydantic_object(self.entity_class)
        return self.entity_class(**parsed.dict())

    platform_resource_key = entity_id.to_platform_resource_key()
    cache_key = platform_resource_key.id

    # Cache lookup and hit/miss counters are updated under the cache lock.
    with self._cache_lock:
        cached = self.external_entity_cache.get(cache_key)
        if cached is not None:
            self.external_entity_cache_hits += 1
            logger.debug(f"Cache hit for get_entity_from_datahub: {cache_key}")
            return cached

        self.external_entity_cache_misses += 1
        logger.debug(f"Cache miss for get_entity_from_datahub {entity_id}")

    primary_key_query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.PRIMARY_KEY, platform_resource_key.primary_key),
    )
    # add_to_cache=False: this method stores the result in its own cache below.
    matches = list(self.search_by_filter(primary_key_query, add_to_cache=False))

    entity = None
    if len(matches) == 1:
        entity = _to_entity(matches[0])
    elif len(matches) > 1:
        # Several resources share the primary key: disambiguate by platform instance.
        wanted_instance = entity_id.platform_instance
        for match in matches:
            candidate = _to_entity(match)
            if (
                candidate is not None
                and candidate.get_id().platform_instance == wanted_instance
            ):
                entity = candidate
                break

    if entity is None:
        try:
            entity = self.create_default_entity(entity_id, managed_by_datahub)
        except Exception as create_error:
            # Count and log the failure, then let the caller see it.
            self.entity_creation_errors += 1
            logger.error(
                f"Failed to create default entity for {entity_id}: {create_error}"
            )
            raise

    with self._cache_lock:
        self.external_entity_cache[cache_key] = entity
    return entity
|
|
470
|
+
|
|
471
|
+
def as_obj(self) -> dict:
    """Report cache statistics and error counters as a plain dictionary.

    Described by the original author as the SupportsAsObj hook used for
    automatic report serialization.

    Returns:
        A dictionary shaped as:
        {
            "search_by_urn_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
            "external_entity_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
            "errors": {"cache_updates": int, "cache_invalidations": int, "entity_creations": int, "cache_key_parsing": int}
        }
    """
    urn_cache_stats = {
        "hits": self.urn_search_cache_hits,
        "misses": self.urn_search_cache_misses,
        "current_size": len(self.urn_search_cache),
        "max_size": int(self.urn_search_cache.maxsize),
    }
    entity_cache_stats = {
        "hits": self.external_entity_cache_hits,
        "misses": self.external_entity_cache_misses,
        "current_size": len(self.external_entity_cache),
        "max_size": int(self.external_entity_cache.maxsize),
    }
    error_counts = {
        "cache_updates": self.cache_update_errors,
        "cache_invalidations": self.cache_invalidation_errors,
        "entity_creations": self.entity_creation_errors,
        "cache_key_parsing": self.cache_key_parsing_errors,
    }
    return {
        "search_by_urn_cache": urn_cache_stats,
        "external_entity_cache": entity_cache_stats,
        "errors": error_counts,
    }
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
class ExternalEntityId(BaseModel):
    """Unique identifier of an ExternalEntity.

    Concrete ID models provide ``to_platform_resource_key`` to map the ID onto
    a PlatformResourceKey.
    """

    # Platform instance associated with this ID, if any (defaults to None).
    platform_instance: Optional[str] = None

    @abstractmethod
    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Return the PlatformResourceKey that corresponds to this ID."""
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
class CaseSensitivity(Enum):
    """Letter-case classification of a string or of a list of strings."""

    UPPER = "upper"
    LOWER = "lower"
    MIXED = "mixed"

    @staticmethod
    def detect_case_sensitivity(value: str) -> "CaseSensitivity":
        """Classify one string via ``str.isupper`` / ``str.islower``.

        Returns UPPER when ``value.isupper()`` holds, LOWER when
        ``value.islower()`` holds, and MIXED otherwise (which includes
        strings without any cased character, such as "" or "123").
        """
        return (
            CaseSensitivity.UPPER
            if value.isupper()
            else CaseSensitivity.LOWER
            if value.islower()
            else CaseSensitivity.MIXED
        )

    @staticmethod
    def detect_for_many(values: List[str]) -> "CaseSensitivity":
        """Classify a list of strings as a whole.

        Returns UPPER when every string is UPPER, LOWER when every string is
        LOWER, and MIXED for anything else, including an empty list.
        """
        if not values:
            return CaseSensitivity.MIXED

        classify = CaseSensitivity.detect_case_sensitivity
        # UPPER is tested before LOWER; each test stops at the first mismatch.
        for uniform_case in (CaseSensitivity.UPPER, CaseSensitivity.LOWER):
            if all(classify(item) == uniform_case for item in values):
                return uniform_case
        return CaseSensitivity.MIXED
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
class LinkedResourceSet(BaseModel):
    """
    A LinkedResourceSet is a set of DataHub URNs that are linked to an ExternalEntity.
    """

    # String form of every linked DataHub URN.
    urns: List[str]

    def _has_conflict(self, urn: Urn) -> bool:
        """Tell whether adding ``urn`` would make the set inconsistent.

        A conflict is reported when:
          * an existing entry cannot be parsed as a URN,
          * the existing entries do not all share one entity type, or
          * ``urn`` has a different entity type than the existing entries
            (e.g. mixing tags and terms in the same set).

        A URN that is already present is never a conflict.

        Args:
            urn: The parsed URN that is about to be added.

        Returns:
            True if the URN is not safe to add, False otherwise.
        """
        if urn.urn() in self.urns:
            return False

        # Determine the single entity type shared by the existing entries.
        detected_entity_type = None
        for existing_urn in self.urns:
            try:
                entity_type = Urn.from_string(existing_urn).entity_type
            except ValueError:
                # Not a valid URN
                logger.warning(f"Invalid URN {existing_urn} in LinkedResourceSet")
                return True

            if detected_entity_type is None:
                detected_entity_type = entity_type
            elif detected_entity_type != entity_type:
                logger.warning(
                    f"Detected entity_type {detected_entity_type} is not equals to {entity_type}"
                )
                return True

        # ``urn`` is already parsed, so no parsing error can occur here.
        if (
            detected_entity_type is not None
            and urn.entity_type != detected_entity_type
        ):
            logger.warning(
                f"Detected entity_type {detected_entity_type} is not equals to parsed_urn's entity_type: {urn.entity_type}"
            )
            return True
        return False

    def add(self, urn: Union[str, Urn]) -> bool:
        """Add a URN to the set.

        Args:
            urn: The URN to add, either as a string or as a parsed ``Urn``.

        Returns:
            True if the URN was added, False if it was already in the set.

        Raises:
            ValueError: If the URN conflicts with the existing set (see
                ``_has_conflict``).
        """
        # Drop duplicates (e.g. left behind by concurrent runs) while keeping
        # the first occurrence of each URN, so the stored order stays stable.
        self.urns = list(dict.fromkeys(self.urns))

        if isinstance(urn, str):
            urn = Urn.from_string(urn)
        if self._has_conflict(urn):
            raise ValueError(f"Conflict detected when adding URN {urn} to the set")

        urn_str = urn.urn()
        if urn_str in self.urns:
            return False
        self.urns.append(urn_str)
        return True
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
class ExternalEntity(BaseModel):
    """
    Representation of an entity that lives outside DataHub but can be linked
    to one or more DataHub entities.
    """

    @abstractmethod
    def is_managed_by_datahub(self) -> bool:
        """Tell whether this entity is managed by DataHub."""

    @abstractmethod
    def datahub_linked_resources(self) -> LinkedResourceSet:
        """
        Return the URNs of the DataHub entities linked to this external entity
        (an empty set when nothing is linked).
        """

    @abstractmethod
    def as_platform_resource(self) -> PlatformResource:
        """Convert this entity into a PlatformResource."""

    @abstractmethod
    def get_id(self) -> ExternalEntityId:
        """Return the ExternalEntityId that identifies this entity."""

    @classmethod
    @abstractmethod
    def create_default(
        cls, entity_id: "ExternalEntityId", managed_by_datahub: bool
    ) -> "ExternalEntity":
        """
        Build the default instance used when no entity was found in DataHub.

        Args:
            entity_id: The external entity ID (concrete implementations can
                expect their specific ID type).
            managed_by_datahub: Whether the entity is managed by DataHub.

        Returns:
            Default entity instance.
        """
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
class MissingExternalEntity(ExternalEntity):
    """ExternalEntity placeholder that carries only an ID and no DataHub links."""

    # Identifier of the entity this placeholder stands for.
    id: ExternalEntityId

    def is_managed_by_datahub(self) -> bool:
        """Always False for a missing entity."""
        return False

    def datahub_linked_resources(self) -> LinkedResourceSet:
        """Return an empty LinkedResourceSet."""
        empty_links = LinkedResourceSet(urns=[])
        return empty_links

    def as_platform_resource(self) -> Optional[PlatformResource]:  # type: ignore[override]
        """Return None: a missing entity has no platform resource form."""
        return None

    def get_id(self) -> ExternalEntityId:
        """Return the stored ID."""
        return self.id

    @classmethod
    def create_default(
        cls, entity_id: ExternalEntityId, managed_by_datahub: bool
    ) -> "MissingExternalEntity":
        """Create a missing external entity for ``entity_id``.

        ``managed_by_datahub`` is accepted for signature compatibility and is
        not used.
        """
        missing = cls(id=entity_id)
        return missing
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
class ExternalSystem:
    """Interface for looking up entities in an external system.

    NOTE(review): this class declares no ABC base or metaclass here, so the
    ``@abstractmethod`` markers are not enforced at instantiation time.
    """

    @abstractmethod
    def exists(self, external_entity_id: ExternalEntityId) -> bool:
        """Tell whether the given ExternalEntityId exists in the external system."""

    @abstractmethod
    def get(
        self,
        external_entity_id: ExternalEntityId,
        platform_resource_repository: PlatformResourceRepository,
    ) -> Optional[ExternalEntity]:
        """
        Return the ExternalEntity for the given ExternalEntityId.

        The platform resource repository is used to enrich the ExternalEntity
        with DataHub URNs.
        """
|