acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import pathlib
|
|
5
|
+
import re
|
|
6
|
+
import tempfile
|
|
7
|
+
from typing import (
|
|
8
|
+
Any,
|
|
9
|
+
Callable,
|
|
10
|
+
Dict,
|
|
11
|
+
List,
|
|
12
|
+
Optional,
|
|
13
|
+
Sequence,
|
|
14
|
+
Set,
|
|
15
|
+
Tuple,
|
|
16
|
+
Type,
|
|
17
|
+
Union,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
import pytest
|
|
21
|
+
|
|
22
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
23
|
+
from datahub.ingestion.sink.file import write_metadata_file
|
|
24
|
+
from datahub.metadata.schema_classes import MetadataChangeEventClass
|
|
25
|
+
from datahub.metadata.urns import Urn
|
|
26
|
+
from datahub.testing.compare_metadata_json import (
|
|
27
|
+
assert_metadata_files_equal,
|
|
28
|
+
load_json_file,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)

# Regex patterns for diff paths that should be skipped when comparing
# metadata files: the `time` member of the audit-style stamps
# (created / lastModified / createStamp / auditStamp) found under the
# aspects of an MCE's proposedSnapshot. These values come from the ETL
# pipeline, so they are not stable from run to run.
IGNORE_PATH_TIMESTAMPS = [
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['created'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['lastModified'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['createStamp'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['auditStamp'\]\['time'\]",
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MCEConstants:
    """Dictionary keys used when inspecting serialized MCE events."""

    # Top-level key whose presence marks an event as an MCE.
    PROPOSED_SNAPSHOT = "proposedSnapshot"
    # Key nested under the proposed snapshot for dataset entities.
    DATASET_SNAPSHOT_CLASS = "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class MCPConstants:
    """Dictionary keys used when inspecting serialized MCP events."""

    # Presence of this key is what marks an event as an MCP.
    CHANGE_TYPE = "changeType"

    # Entity identification.
    ENTITY_URN = "entityUrn"
    ENTITY_TYPE = "entityType"

    # Aspect name and payload.
    ASPECT_NAME = "aspectName"
    ASPECT_VALUE = "aspect"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class EntityType:
    """Entity type strings, with several names sharing one value."""

    DATASET = "dataset"
    # PIPELINE and FLOW are two names for the same entity type string.
    PIPELINE = FLOW = "dataFlow"
    # TASK and JOB are two names for the same entity type string.
    TASK = JOB = "dataJob"
    USER = "corpuser"
    GROUP = "corpGroup"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def clean_nones(value):
    """
    Return a copy of `value` with every None stripped out, recursively.

    Dict entries whose value is None and list items that are None are
    dropped at every nesting level; the surviving dicts and lists are
    rebuilt as new objects. Any value that is neither a dict nor a list
    (including tuples) is returned unchanged.
    """
    if isinstance(value, dict):
        pruned = {}
        for key, val in value.items():
            if val is None:
                continue
            pruned[key] = clean_nones(val)
        return pruned

    if isinstance(value, list):
        return [clean_nones(item) for item in value if item is not None]

    return value
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def check_golden_file(
    pytestconfig: pytest.Config,
    output_path: Union[str, os.PathLike],
    golden_path: Union[str, os.PathLike],
    ignore_paths: Sequence[str] = (),
    ignore_paths_v2: Sequence[str] = (),
    ignore_order: bool = True,
) -> None:
    """
    Compare an output metadata file against its golden file.

    Every argument except `pytestconfig` is forwarded unchanged to
    `assert_metadata_files_equal`; `pytestconfig` is accepted but never
    read.
    """
    # TODO: Drop the redundant `pytestconfig` parameter. More directly, this
    # wrapper could go away entirely in favor of calling
    # assert_metadata_files_equal (perhaps renamed "check_golden_metadata").
    # The `output_path` argument is also often a nuisance - the pytest setup
    # should be the one deciding where the temp file lives.
    assert_metadata_files_equal(
        output_path=output_path,
        golden_path=golden_path,
        ignore_paths=ignore_paths,
        ignore_paths_v2=ignore_paths_v2,
        ignore_order=ignore_order,
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def check_goldens_stream(
    outputs: List,
    golden_path: Union[str, os.PathLike],
    ignore_paths: Sequence[str] = (),
    ignore_order: bool = True,
) -> None:
    """Write ``outputs`` to a temporary metadata file and compare it with a golden file.

    Args:
        outputs: Records passed to ``write_metadata_file``.
        golden_path: Location of the golden file to compare against.
        ignore_paths: Forwarded unchanged to ``assert_metadata_files_equal``.
        ignore_order: Forwarded unchanged to ``assert_metadata_files_equal``.
    """
    # The file is written and re-read by *name*. A NamedTemporaryFile cannot be
    # opened a second time by name on Windows while it is still open, so the
    # output lives in a temporary directory instead; the directory and its
    # contents are removed when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_path = pathlib.Path(tmp_dir) / "output.json"
        write_metadata_file(output_path, outputs)

        assert_metadata_files_equal(
            output_path=str(output_path),
            golden_path=golden_path,
            ignore_paths=ignore_paths,
            ignore_order=ignore_order,
        )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _get_field_for_entity_type_in_mce(entity_type: str) -> str:
    """Return the snapshot field that identifies ``entity_type`` inside an MCE.

    Only ``EntityType.DATASET`` is handled; any other value raises ``Exception``.
    """
    if entity_type != EntityType.DATASET:
        raise Exception(f"Not implemented for entity_type {entity_type}")
    return MCEConstants.DATASET_SNAPSHOT_CLASS
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _get_filter(
    mce: bool = False, mcp: bool = False, entity_type: Optional[str] = None
) -> Callable[[Dict], bool]:
    """Build a predicate that recognises MCE or MCP events.

    ``mce`` takes precedence over ``mcp``. When ``entity_type`` is given the
    predicate additionally requires the event to be for that entity type.
    With neither flag set, the returned predicate rejects every event.
    """
    if mce:
        # cheap way to determine if we are working with an MCE for the appropriate entity_type
        def _is_any_mce(event: Dict) -> bool:
            return MCEConstants.PROPOSED_SNAPSHOT in event

        def _is_mce_for_entity(event: Dict) -> bool:
            if MCEConstants.PROPOSED_SNAPSHOT not in event:
                return False
            # Resolved per event, and only once the snapshot key is present.
            snapshot_field = _get_field_for_entity_type_in_mce(str(entity_type))
            return snapshot_field in event[MCEConstants.PROPOSED_SNAPSHOT]

        return _is_mce_for_entity if entity_type else _is_any_mce

    if mcp:
        # cheap way to determine if we are working with an MCP
        def _is_mcp_for_entity(event: Dict) -> bool:
            if MCPConstants.CHANGE_TYPE not in event:
                return False
            if not entity_type:
                return True
            return event[MCPConstants.ENTITY_TYPE] == entity_type

        return _is_mcp_for_entity

    def _reject_all(_: Dict) -> bool:
        return False

    return _reject_all
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _get_element(event: Dict[str, Any], path_spec: List[str]) -> Any:
|
|
148
|
+
try:
|
|
149
|
+
for p in path_spec:
|
|
150
|
+
if p not in event:
|
|
151
|
+
return None
|
|
152
|
+
else:
|
|
153
|
+
event = event.get(p, {})
|
|
154
|
+
return event
|
|
155
|
+
except Exception as e:
|
|
156
|
+
print(event)
|
|
157
|
+
raise e
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _element_matches_pattern(
    event: Dict[str, Any], path_spec: List[str], pattern: str
) -> Tuple[bool, bool]:
    """Look up ``path_spec`` in ``event`` and test the result against ``pattern``.

    Returns a ``(found, matched)`` pair: ``found`` is False when the lookup
    yields None, and ``matched`` tells whether ``re.search`` finds ``pattern``
    in the string form of the element.
    """
    import re

    element = _get_element(event, path_spec)
    found = element is not None
    matched = found and re.search(pattern, str(element)) is not None
    return (found, matched)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_entity_urns(events_file: str) -> Set[str]:
    """Load the JSON events in ``events_file`` and return the entity urns found in them.

    The file content must be a list; the urn extraction itself is done by
    ``_get_entity_urns``.
    """
    loaded_events = load_json_file(events_file)
    assert isinstance(loaded_events, list)
    return _get_entity_urns(loaded_events)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _get_entity_urns(events_list: List[Dict]) -> Set[str]:
    """Collect the urns of all "dataset" events, from both MCEs and MCPs."""
    entity_type = "dataset"

    is_mce = _get_filter(mce=True, entity_type=entity_type)
    is_mcp = _get_filter(mcp=True, entity_type=entity_type)
    mce_urn_path = _get_mce_urn_path_spec(entity_type)
    mcp_urn_path = _get_mcp_urn_path_spec()

    urns: Set[str] = set()
    for event in events_list:
        # mce urns
        if is_mce(event):
            urns.add(_get_element(event, mce_urn_path))
        # mcp urns
        if is_mcp(event):
            urns.add(_get_element(event, mcp_urn_path))
    return urns
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def assert_mcp_entity_urn(
    filter: str, entity_type: str, regex_pattern: str, file: str
) -> int:
    """Assert that the entity urn of every MCP for ``entity_type`` matches ``regex_pattern``.

    Returns the number of MCP events that were checked. Raises ``Exception``
    when any checked event has no urn or a urn that does not match, or when
    the file does not contain a list. ``filter`` is not used by this function.
    """
    events = load_json_file(file)
    if not isinstance(events, list):
        raise Exception(
            f"Did not expect the file {file} to not contain a list of items"
        )

    urn_path = [MCPConstants.ENTITY_URN]
    is_relevant = _get_filter(mcp=True, entity_type=entity_type)
    checked = [
        (event, _element_matches_pattern(event, urn_path, regex_pattern))
        for event in events
        if is_relevant(event)
    ]
    # Each entry is (event, (found, matched)); both flags must be true to pass.
    mismatches = [entry for entry in checked if not (entry[1][0] and entry[1][1])]
    if mismatches:
        raise Exception("Failed to match events", mismatches)
    return len(checked)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _get_mce_urn_path_spec(entity_type: str) -> List[str]:
    """Return the key path to the urn inside an MCE for ``entity_type``.

    Only ``EntityType.DATASET`` is handled; any other value raises ``Exception``.
    """
    if entity_type != EntityType.DATASET:
        raise Exception(f"Not implemented for entity_type: {entity_type}")

    urn_path = [
        MCEConstants.PROPOSED_SNAPSHOT,
        MCEConstants.DATASET_SNAPSHOT_CLASS,
        "urn",
    ]
    return urn_path
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _get_mcp_urn_path_spec() -> List[str]:
    """Return the key path to the entity urn inside an MCP (a single top-level key)."""
    return [MCPConstants.ENTITY_URN]
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def assert_mce_entity_urn(
    filter: str, entity_type: str, regex_pattern: str, file: str
) -> int:
    """Assert that all mce entity urns must match the regex pattern passed in. Return the number of events matched

    Args:
        filter: Not used by this function.
        entity_type: Entity type used to locate the urn inside each MCE.
        regex_pattern: Pattern every MCE urn is searched for.
        file: Path of the JSON file holding the list of events.

    Raises:
        Exception: If the file does not contain a list, or if any MCE has no
            urn at the expected path or a urn that does not match.
    """
    test_output = load_json_file(file)
    if not isinstance(test_output, list):
        raise Exception(
            f"Did not expect the file {file} to not contain a list of items"
        )

    path_spec = _get_mce_urn_path_spec(entity_type)
    filter_operator = _get_filter(mce=True)
    filtered_events = [
        (x, _element_matches_pattern(x, path_spec, regex_pattern))
        for x in test_output
        if filter_operator(x)
    ]
    # Each entry is (event, (found, matched)); either flag being False is a failure.
    failed_events = [y for y in filtered_events if not y[1][0] or not y[1][1]]
    if failed_events:
        # This must be an f-string: without the prefix the placeholder text was
        # emitted literally and the failing events were never shown.
        raise Exception(
            f"Failed to match events: {json.dumps(failed_events, indent=2)}"
        )
    return len(filtered_events)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def assert_for_each_entity(
    entity_type: str,
    aspect_name: str,
    aspect_field_matcher: Dict[str, Any],
    file: str,
    exception_urns: Optional[List[str]] = None,
) -> int:
    """Assert that an aspect name with the desired fields exists for each entity urn

    Urns are gathered from both MCEs and MCPs of ``entity_type`` in ``file``.
    For every urn, the MCP aspect named ``aspect_name`` must be present (unless
    the urn is listed in ``exception_urns``) and each field in
    ``aspect_field_matcher`` must equal the expected value. Returns the number
    of urns that passed; raises ``AssertionError`` otherwise.
    """
    allowed_missing = [] if exception_urns is None else exception_urns

    events = load_json_file(file)
    assert isinstance(events, list)

    is_mce = _get_filter(mce=True, entity_type=entity_type)
    is_mcp = _get_filter(mcp=True, entity_type=entity_type)
    mcp_events = [event for event in events if is_mcp(event)]

    # mce urns
    mce_urns = {
        _get_element(event, _get_mce_urn_path_spec(entity_type))
        for event in events
        if is_mce(event)
    }
    mcp_urns = {_get_element(event, _get_mcp_urn_path_spec()) for event in mcp_events}
    all_urns = mce_urns | mcp_urns
    # there should not be any None urns
    assert None not in all_urns

    aspect_by_urn: Dict[Any, Any] = dict.fromkeys(all_urns)
    # iterate over all mcps
    for event in mcp_events:
        if event.get(MCPConstants.ASPECT_NAME) != aspect_name:
            continue
        # load the inner aspect payload and assign to this urn
        payload = event.get(MCPConstants.ASPECT_VALUE, {}).get("json")
        aspect_by_urn[event[MCPConstants.ENTITY_URN]] = payload

    success: List[str] = []
    failures: List[str] = []
    for urn, aspect_val in aspect_by_urn.items():
        if aspect_val is None:
            # No matching aspect was seen for this urn.
            if urn not in allowed_missing:
                print(f"Adding {urn} to failures")
                failures.append(urn)
            continue

        for field, expected in aspect_field_matcher.items():
            actual = _get_element(aspect_val, [field])
            assert expected == actual, (
                f"urn: {urn} -> Field {field} must match value {expected}, found {actual}"
            )
        success.append(urn)

    if success:
        print(f"Succeeded on assertion for urns {success}")
    if failures:
        raise AssertionError(
            f"Failed to find aspect_name {aspect_name} for urns {json.dumps(failures, indent=2)}"
        )

    return len(success)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def assert_entity_mce_aspect(
    entity_urn: str, aspect: Any, aspect_type: Type, file: str
) -> int:
    """Assert that every ``aspect_type`` aspect on the MCEs for ``entity_urn`` equals ``aspect``.

    Returns the number of aspects that were compared (zero when none of the
    MCEs in ``file`` carries an aspect of that type for the urn).
    """
    # TODO: Replace with read_metadata_file()
    events = load_json_file(file)
    entity_type = Urn.from_string(entity_urn).entity_type
    assert isinstance(events, list)

    is_mce = _get_filter(mce=True, entity_type=entity_type)
    # mces whose urn is entity_urn
    matching_mces: List[MetadataChangeEventClass] = []
    for event in events:
        if not is_mce(event):
            continue
        if _get_element(event, _get_mce_urn_path_spec(entity_type)) != entity_urn:
            continue
        matching_mces.append(MetadataChangeEventClass.from_obj(event))

    matches = 0
    for mce in matching_mces:
        for candidate in mce.proposedSnapshot.aspects:
            if not isinstance(candidate, aspect_type):
                continue
            assert candidate == aspect
            matches += 1
    return matches
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def assert_entity_mcp_aspect(
    entity_urn: str, aspect_field_matcher: Dict[str, Any], aspect_name: str, file: str
) -> int:
    """Assert that the ``aspect_name`` MCPs for ``entity_urn`` carry the expected field values.

    Every field in ``aspect_field_matcher`` must equal the corresponding
    top-level field of the serialized aspect. Returns the number of MCPs with
    that aspect name that were checked.
    """
    # TODO: Replace with read_metadata_file()
    events = load_json_file(file)
    entity_type = Urn.from_string(entity_urn).entity_type
    assert isinstance(events, list)

    is_mcp = _get_filter(mcp=True, entity_type=entity_type)
    # mcps that match entity_urn
    matching_mcps: List[MetadataChangeProposalWrapper] = []
    for event in events:
        if not is_mcp(event):
            continue
        if _get_element(event, _get_mcp_urn_path_spec()) != entity_urn:
            continue
        matching_mcps.append(
            MetadataChangeProposalWrapper.from_obj_require_wrapper(event)
        )

    matches = 0
    for mcp in matching_mcps:
        if mcp.aspectName != aspect_name:
            continue
        assert mcp.aspect
        aspect_val = mcp.aspect.to_obj()
        for field, expected in aspect_field_matcher.items():
            actual = _get_element(aspect_val, [field])
            assert expected == actual, (
                f"urn: {mcp.entityUrn} -> Field {field} must match value {expected}, found {actual}"
            )
        matches += 1
    return matches
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def assert_entity_urn_not_like(entity_type: str, regex_pattern: str, file: str) -> int:
    """Assert that no entity urn in ``file`` matches ``regex_pattern``.

    Urns are gathered from both MCEs and MCPs of ``entity_type`` and tested
    with ``re.match``. Raises ``AssertionError`` when any urn matches; otherwise
    returns the total number of events in the file.
    """
    # TODO: Refactor common code with assert_entity_urn_like.
    events = load_json_file(file)
    assert isinstance(events, list)

    is_mce = _get_filter(mce=True, entity_type=entity_type)
    is_mcp = _get_filter(mcp=True, entity_type=entity_type)
    # mce urns
    mce_urns = {
        _get_element(event, _get_mce_urn_path_spec(entity_type))
        for event in events
        if is_mce(event)
    }
    mcp_urns = {
        _get_element(event, _get_mcp_urn_path_spec())
        for event in events
        if is_mcp(event)
    }
    all_urns = mce_urns | mcp_urns
    print(all_urns)

    denied_urns = [urn for urn in all_urns if re.match(regex_pattern, urn)]
    if denied_urns:
        raise AssertionError(f"urns found that match the deny list {denied_urns}")
    return len(events)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def assert_entity_urn_like(entity_type: str, regex_pattern: str, file: str) -> int:
    """Assert that at least one entity urn in ``file`` matches ``regex_pattern``.

    Urns are gathered from both MCEs and MCPs of ``entity_type`` and tested
    with ``re.match``. Returns the number of distinct urns that matched; raises
    ``AssertionError`` when none does.
    """
    events = load_json_file(file)
    assert isinstance(events, list)

    is_mce = _get_filter(mce=True, entity_type=entity_type)
    is_mcp = _get_filter(mcp=True, entity_type=entity_type)
    # mce urns
    mce_urns = {
        _get_element(event, _get_mce_urn_path_spec(entity_type))
        for event in events
        if is_mce(event)
    }
    mcp_urns = {
        _get_element(event, _get_mcp_urn_path_spec())
        for event in events
        if is_mcp(event)
    }
    all_urns = mce_urns | mcp_urns
    print(all_urns)

    matched_urns = [urn for urn in all_urns if re.match(regex_pattern, urn)]
    if not matched_urns:
        raise AssertionError(
            f"No urns found that match the pattern {regex_pattern}. Full list is {all_urns}"
        )
    return len(matched_urns)
|
datahub/testing/mcp_diff.py
CHANGED
|
@@ -2,13 +2,12 @@ import dataclasses
|
|
|
2
2
|
import json
|
|
3
3
|
import re
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
from typing import Any, Dict, List, Sequence, Set, Tuple, Union
|
|
5
|
+
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
|
6
6
|
|
|
7
7
|
import deepdiff.serialization
|
|
8
8
|
import yaml
|
|
9
9
|
from deepdiff import DeepDiff
|
|
10
10
|
from deepdiff.model import DiffLevel
|
|
11
|
-
from deepdiff.operator import BaseOperator
|
|
12
11
|
from typing_extensions import Literal
|
|
13
12
|
|
|
14
13
|
ReportType = Literal[
|
|
@@ -35,6 +34,7 @@ class AspectForDiff:
|
|
|
35
34
|
aspect_name: str
|
|
36
35
|
aspect: Dict[str, Any] = dataclasses.field(hash=False)
|
|
37
36
|
delta_info: "DeltaInfo" = dataclasses.field(hash=False, repr=False)
|
|
37
|
+
headers: Optional[Dict[str, str]] = dataclasses.field(default=None, hash=False)
|
|
38
38
|
|
|
39
39
|
@classmethod
|
|
40
40
|
def create_from_mcp(cls, idx: int, obj: Dict[str, Any]) -> "AspectForDiff":
|
|
@@ -45,6 +45,7 @@ class AspectForDiff:
|
|
|
45
45
|
aspect_name=obj["aspectName"],
|
|
46
46
|
aspect=aspect.get("json", aspect),
|
|
47
47
|
delta_info=DeltaInfo(idx=idx, original=obj),
|
|
48
|
+
headers=obj.get("headers"),
|
|
48
49
|
)
|
|
49
50
|
|
|
50
51
|
def __repr__(self):
|
|
@@ -59,27 +60,12 @@ class AspectForDiff:
|
|
|
59
60
|
|
|
60
61
|
@dataclasses.dataclass
|
|
61
62
|
class DeltaInfo:
|
|
62
|
-
"""Information about an MCP used to construct a diff delta.
|
|
63
|
-
|
|
64
|
-
In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
|
|
65
|
-
"""
|
|
63
|
+
"""Information about an MCP used to construct a diff delta."""
|
|
66
64
|
|
|
67
65
|
idx: int # Location in list of MCEs in golden file
|
|
68
66
|
original: Dict[str, Any] # Original json-serialized MCP
|
|
69
67
|
|
|
70
68
|
|
|
71
|
-
class DeltaInfoOperator(BaseOperator):
|
|
72
|
-
"""Warning: Doesn't seem to be working right now.
|
|
73
|
-
Ignored via an ignore path as an extra layer of defense.
|
|
74
|
-
"""
|
|
75
|
-
|
|
76
|
-
def __init__(self):
|
|
77
|
-
super().__init__(types=[DeltaInfo])
|
|
78
|
-
|
|
79
|
-
def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
|
|
80
|
-
return True
|
|
81
|
-
|
|
82
|
-
|
|
83
69
|
AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]
|
|
84
70
|
|
|
85
71
|
|
|
@@ -176,7 +162,6 @@ class MCPDiff:
|
|
|
176
162
|
t2=t2,
|
|
177
163
|
exclude_regex_paths=ignore_paths,
|
|
178
164
|
ignore_order=True,
|
|
179
|
-
custom_operators=[DeltaInfoOperator()],
|
|
180
165
|
)
|
|
181
166
|
if diff:
|
|
182
167
|
aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
|
|
@@ -206,7 +191,7 @@ class MCPDiff:
|
|
|
206
191
|
"""
|
|
207
192
|
aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()]
|
|
208
193
|
for aspect_diff in aspect_diffs:
|
|
209
|
-
for _, old, new in aspect_diff.aspects_changed
|
|
194
|
+
for _, old, new in aspect_diff.aspects_changed:
|
|
210
195
|
golden[old.delta_info.idx] = new.delta_info.original
|
|
211
196
|
|
|
212
197
|
indices_to_remove = set()
|
|
@@ -257,9 +242,12 @@ class MCPDiff:
|
|
|
257
242
|
s.append(serialize_aspect(ga.aspect))
|
|
258
243
|
for (i, old, new), diffs in aspect_diffs.aspects_changed.items():
|
|
259
244
|
s.append(self.report_aspect(old, i, "changed") + ":")
|
|
245
|
+
|
|
246
|
+
print_aspects = False
|
|
260
247
|
for diff_level in diffs:
|
|
261
248
|
s.append(self.report_diff_level(diff_level, i))
|
|
262
|
-
|
|
249
|
+
print_aspects |= self.is_diff_level_on_aspect(diff_level)
|
|
250
|
+
if verbose and print_aspects:
|
|
263
251
|
s.append(f"Old aspect:\n{serialize_aspect(old.aspect)}")
|
|
264
252
|
s.append(f"New aspect:\n{serialize_aspect(new.aspect)}")
|
|
265
253
|
|
|
@@ -288,6 +276,14 @@ class MCPDiff:
|
|
|
288
276
|
f"root[{idx}].", ""
|
|
289
277
|
)
|
|
290
278
|
|
|
279
|
+
@staticmethod
def is_diff_level_on_aspect(diff: DiffLevel) -> bool:
    """Tell whether ``diff`` concerns something other than the skipped top-level fields.

    The second element of the diff's list-form path is compared against
    "changeType" and "headers"; a diff on either of those returns False. A
    path too short to have a second element returns True.
    """
    try:
        top_level_field = diff.path(output_format="list")[1]
    except IndexError:
        return True
    return top_level_field not in ("changeType", "headers")
|
|
286
|
+
|
|
291
287
|
|
|
292
288
|
def serialize_aspect(aspect: Union[AspectForDiff, Dict[str, Any]]) -> str:
|
|
293
289
|
if isinstance(aspect, AspectForDiff): # Unpack aspect
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
from typing import Sequence
|
|
3
|
+
|
|
4
|
+
from datahub.sdk.entity import Entity
|
|
5
|
+
from datahub.testing import mce_helpers
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def assert_entity_golden(
    entity: Entity,
    golden_path: pathlib.Path,
    ignore_paths: Sequence[str] = (),
) -> None:
    """Check the MCPs returned by ``entity.as_mcps()`` against the golden file at ``golden_path``.

    Delegates to ``mce_helpers.check_goldens_stream`` with ``ignore_order=False``;
    ``ignore_paths`` is forwarded unchanged.
    """
    mce_helpers.check_goldens_stream(
        outputs=entity.as_mcps(),
        golden_path=golden_path,
        ignore_order=False,
        ignore_paths=ignore_paths,
    )
|