acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from datahub.configuration.source_common import EnvConfigMixin
|
|
6
|
+
from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VertexAIConfig(EnvConfigMixin):
    # Configuration model for the Vertex AI source: optional GCP credential,
    # required project/region, and optional bucket / console URL settings.
    # (Kept as a comment rather than a docstring so the pydantic schema
    # description of the model is unaffected.)

    # Optional explicit GCP credential; when absent, get_credentials()
    # returns None.
    credential: Optional[GCPCredential] = Field(
        default=None,
        description="GCP credential information",
    )

    # Required: no default is supplied for either field.
    project_id: str = Field(description="Project ID in Google Cloud Platform")
    region: str = Field(description="Region of your project in Google Cloud Platform")

    bucket_uri: Optional[str] = Field(
        default=None, description="Bucket URI used in your project"
    )

    # Defaults to the Google Cloud console entry point for Vertex AI.
    vertexai_url: Optional[str] = Field(
        default="https://console.cloud.google.com/vertex-ai",
        description="VertexUI URI",
    )

    def get_credentials(self) -> Optional[Dict[str, str]]:
        """Return the configured credential as a dict, or None if unset.

        The dict is produced by ``GCPCredential.to_dict`` called with this
        config's ``project_id``.
        """
        if not self.credential:
            return None
        return self.credential.to_dict(self.project_id)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
|
|
3
|
+
from google.cloud.aiplatform.base import VertexAiResourceNoun
|
|
4
|
+
from google.cloud.aiplatform.jobs import _RunnableJob
|
|
5
|
+
from google.cloud.aiplatform.training_jobs import _TrainingJob
|
|
6
|
+
from google.cloud.aiplatform_v1.types import JobState, PipelineState, PipelineTaskDetail
|
|
7
|
+
|
|
8
|
+
from datahub.metadata.schema_classes import RunResultTypeClass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_automl_job_result_type(state: PipelineState) -> Union[str, RunResultTypeClass]:
    """Translate a Vertex AI ``PipelineState`` into a run-result value.

    Succeeded/failed states map to the corresponding ``RunResultTypeClass``
    constants; the other listed states map to descriptive strings. Any state
    that is not listed yields ``"UNKNOWN"``.
    """
    result_by_state = {
        PipelineState.PIPELINE_STATE_SUCCEEDED: RunResultTypeClass.SUCCESS,
        PipelineState.PIPELINE_STATE_FAILED: RunResultTypeClass.FAILURE,
        PipelineState.PIPELINE_STATE_CANCELLED: "Cancelled",
        PipelineState.PIPELINE_STATE_PAUSED: "Paused",
        PipelineState.PIPELINE_STATE_QUEUED: "Queued",
        PipelineState.PIPELINE_STATE_RUNNING: "Running",
        PipelineState.PIPELINE_STATE_UNSPECIFIED: "Unspecific",
    }

    if state in result_by_state:
        return result_by_state[state]
    return "UNKNOWN"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_custom_job_result_type(state: JobState) -> Union[str, RunResultTypeClass]:
    """Translate a Vertex AI ``JobState`` into a run-result value.

    Succeeded/failed states map to the corresponding ``RunResultTypeClass``
    constants; the other listed states map to descriptive strings. Any state
    that is not listed yields ``"UNKNOWN"``.
    """
    terminal_results = {
        JobState.JOB_STATE_SUCCEEDED: RunResultTypeClass.SUCCESS,
        JobState.JOB_STATE_FAILED: RunResultTypeClass.FAILURE,
    }
    descriptive_results = {
        JobState.JOB_STATE_CANCELLED: "Cancelled",
        JobState.JOB_STATE_PAUSED: "Paused",
        JobState.JOB_STATE_QUEUED: "Queued",
        JobState.JOB_STATE_RUNNING: "Running",
        JobState.JOB_STATE_CANCELLING: "Cancelling",
        JobState.JOB_STATE_EXPIRED: "Expired",
        JobState.JOB_STATE_UPDATING: "Updating",
    }
    return {**terminal_results, **descriptive_results}.get(state, "UNKNOWN")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_job_result_status(job: VertexAiResourceNoun) -> Union[str, RunResultTypeClass]:
    """Return the run-result value for a Vertex AI job resource.

    Training jobs whose state is a ``PipelineState`` are resolved through
    ``get_automl_job_result_type``; runnable jobs whose state is a
    ``JobState`` go through ``get_custom_job_result_type``. Anything else
    yields ``"UNKNOWN"``.
    """
    # ``job.state`` is only read after the job-type check passes, matching
    # the short-circuit order of the combined conditions.
    if isinstance(job, _TrainingJob):
        if isinstance(job.state, PipelineState):
            return get_automl_job_result_type(job.state)

    if isinstance(job, _RunnableJob):
        if isinstance(job.state, JobState):
            return get_custom_job_result_type(job.state)

    return "UNKNOWN"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_execution_result_status(status: int) -> Union[str, RunResultTypeClass]:
    """
    Map a numeric execution state to a run-result value.

    Recognised state codes:
    STATE_UNSPECIFIED = 0
    PENDING = 1
    RUNNING = 2
    SUCCEEDED = 3
    FAILED = 4

    Any other code yields "UNKNOWN".
    """
    # Position in this list is the numeric state code.
    results_in_code_order = [
        "STATE_UNSPECIFIED",
        "PENDING",
        "RUNNING",
        RunResultTypeClass.SUCCESS,
        RunResultTypeClass.FAILURE,
    ]
    result_by_code = dict(enumerate(results_in_code_order))
    return result_by_code.get(status, "UNKNOWN")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_pipeline_task_result_status(
    status: Optional[PipelineTaskDetail.State],
) -> Union[str, RunResultTypeClass]:
    """Map a pipeline task state to a run-result value.

    Only the succeeded, failed and skipped states are translated; ``None``
    and every other state yield ``"UNKNOWN"``.
    """
    # TODO: DataProcessInstanceRunResultClass fails with status string except for SUCCESS, FAILURE, SKIPPED,
    #  which will be fixed in the future
    translated_states = {
        # PipelineTaskDetail.State.STATE_UNSPECIFIED: "STATE_UNSPECIFIED",
        # PipelineTaskDetail.State.PENDING: "PENDING",
        # PipelineTaskDetail.State.RUNNING: "RUNNING",
        # PipelineTaskDetail.State.CANCEL_PENDING: "CANCEL_PENDING",
        # PipelineTaskDetail.State.CANCELLING: "CANCELLING",
        # PipelineTaskDetail.State.NOT_TRIGGERED: "NOT_TRIGGERED",
        PipelineTaskDetail.State.SUCCEEDED: RunResultTypeClass.SUCCESS,
        PipelineTaskDetail.State.FAILED: RunResultTypeClass.FAILURE,
        PipelineTaskDetail.State.SKIPPED: RunResultTypeClass.SKIPPED,
    }
    return (
        "UNKNOWN" if status is None else translated_states.get(status, "UNKNOWN")
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def is_status_for_run_event_class(status: Union[str, RunResultTypeClass]) -> bool:
    """Return True when ``status`` is the SUCCESS or FAILURE run-result constant."""
    accepted_results = (RunResultTypeClass.SUCCESS, RunResultTypeClass.FAILURE)
    return status in accepted_results
|
|
@@ -2,6 +2,7 @@ import re
|
|
|
2
2
|
from typing import Dict, List, Optional, Union
|
|
3
3
|
from urllib.parse import urlparse
|
|
4
4
|
|
|
5
|
+
import pydantic
|
|
5
6
|
from pydantic import Field, validator
|
|
6
7
|
|
|
7
8
|
from datahub.configuration.common import AllowDenyPattern
|
|
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
|
|
|
121
122
|
)
|
|
122
123
|
return client_secret
|
|
123
124
|
|
|
124
|
-
@
|
|
125
|
+
@pydantic.field_validator("web_service_url", mode="after")
|
|
126
|
+
@classmethod
|
|
125
127
|
def web_service_url_scheme_host_port(cls, val: str) -> str:
|
|
126
128
|
# Tokenize the web url
|
|
127
129
|
url = urlparse(val)
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from collections import defaultdict
|
|
2
3
|
from contextlib import AbstractContextManager
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
5
|
from datetime import datetime, timezone
|
|
6
|
+
from enum import Enum
|
|
5
7
|
|
|
6
8
|
from datahub.utilities.perf_timer import PerfTimer
|
|
7
9
|
from datahub.utilities.stats_collections import TopKDict
|
|
@@ -20,31 +22,68 @@ QUERIES_EXTRACTION = "Queries Extraction"
|
|
|
20
22
|
PROFILING = "Profiling"
|
|
21
23
|
|
|
22
24
|
|
|
25
|
+
class IngestionHighStage(Enum):
    """
    Coarse, framework-level ingestion stages.

    Extend this enum with further members as new high-level stages are needed.
    """

    PROFILING = "Profiling"
    # Default high-level stage for callers that do not name one explicitly.
    _UNDEFINED = "Ingestion"
|
|
33
|
+
|
|
34
|
+
|
|
23
35
|
@dataclass
class IngestionStageReport:
    """Report mixin that accumulates per-stage timings for an ingestion run."""

    # Total seconds per high-level stage; missing keys start at 0.0.
    ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
        default_factory=lambda: defaultdict(float)
    )
    # Seconds per named stage, keyed by a "(high stage, stage label)" string.
    ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

    def new_stage(
        self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
    ) -> "IngestionStageContext":
        """Create a timing context for the named ``stage`` under ``high_stage``."""
        return IngestionStageContext(stage=stage, report=self, high_stage=high_stage)

    def new_high_stage(self, stage: IngestionHighStage) -> "IngestionStageContext":
        """Create a timing context that tracks only the high-level ``stage``."""
        # An empty stage name makes the context record into the high-stage totals.
        return IngestionStageContext(stage="", report=self, high_stage=stage)
|
|
29
49
|
|
|
30
50
|
|
|
31
51
|
@dataclass
class IngestionStageContext(AbstractContextManager):
    """Context manager that times one ingestion stage and stores the result on a report.

    With a non-empty ``stage`` name the elapsed time is written to
    ``report.ingestion_stage_durations``; with an empty name it is added to
    ``report.ingestion_high_stage_seconds`` for ``high_stage``.
    """

    def __init__(
        self,
        stage: str,
        report: IngestionStageReport,
        high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED,
    ):
        self._report = report
        self._high_stage = high_stage
        # The label carries the creation time (UTC); an empty label marks a
        # high-stage-only context.
        if stage:
            self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
        else:
            self._ingestion_stage = ""
        self._timer: PerfTimer = PerfTimer()

    def __enter__(self) -> "IngestionStageContext":
        if not self._ingestion_stage:
            logger.info(f"High stage started: {self._high_stage.value}")
        else:
            logger.info(f"Stage started: {self._ingestion_stage}")
        self._timer.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        seconds = self._timer.elapsed_seconds(digits=2)

        if not self._ingestion_stage:
            # High-stage-only context: accumulate into the per-high-stage total.
            logger.info(
                f"Time spent in stage <{self._high_stage.value}>: {seconds} seconds",
                stacklevel=2,
            )
            self._report.ingestion_high_stage_seconds[self._high_stage] += seconds
            return

        logger.info(
            f"Time spent in stage <{self._ingestion_stage}>: {seconds} seconds",
            stacklevel=2,
        )
        # Store tuple as string to avoid serialization errors
        duration_key = f"({self._high_stage.value}, {self._ingestion_stage})"
        self._report.ingestion_stage_durations[duration_key] = seconds
|
|
@@ -54,7 +54,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
|
|
|
54
54
|
data_products_container: Dict[str, DataProductPatchBuilder] = {}
|
|
55
55
|
logger.debug("Generating dataproducts")
|
|
56
56
|
is_container = self.config.is_container
|
|
57
|
-
for entity_urn in self.entity_map
|
|
57
|
+
for entity_urn in self.entity_map:
|
|
58
58
|
data_product_urn = self.config.get_data_product_to_add(entity_urn)
|
|
59
59
|
if data_product_urn:
|
|
60
60
|
if data_product_urn not in data_products:
|
|
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
|
|
|
71
71
|
|
|
72
72
|
server_ownership = graph.get_ownership(entity_urn=urn)
|
|
73
73
|
if server_ownership:
|
|
74
|
-
owners = {
|
|
75
|
-
|
|
74
|
+
owners = {
|
|
75
|
+
(
|
|
76
|
+
owner.owner,
|
|
77
|
+
owner.type,
|
|
78
|
+
owner.typeUrn,
|
|
79
|
+
): owner
|
|
80
|
+
for owner in server_ownership.owners
|
|
81
|
+
}
|
|
82
|
+
owners.update(
|
|
83
|
+
{
|
|
84
|
+
(
|
|
85
|
+
owner.owner,
|
|
86
|
+
owner.type,
|
|
87
|
+
owner.typeUrn,
|
|
88
|
+
): owner
|
|
89
|
+
for owner in mce_ownership.owners
|
|
90
|
+
}
|
|
91
|
+
)
|
|
76
92
|
mce_ownership.owners = list(owners.values())
|
|
77
93
|
|
|
78
94
|
return mce_ownership
|
|
@@ -86,7 +102,7 @@ class AddDatasetOwnership(OwnershipTransformer):
|
|
|
86
102
|
logger.debug("Generating Ownership for containers")
|
|
87
103
|
ownership_container_mapping: Dict[str, List[OwnerClass]] = {}
|
|
88
104
|
for entity_urn, data_ownerships in (
|
|
89
|
-
(urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
|
|
105
|
+
(urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
|
|
90
106
|
):
|
|
91
107
|
if not data_ownerships:
|
|
92
108
|
continue
|
|
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
|
|
|
281
281
|
)
|
|
282
282
|
)
|
|
283
283
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
284
|
+
if mcp.entityUrn:
|
|
285
|
+
record_metadata = _update_work_unit_id(
|
|
286
|
+
envelope=envelope,
|
|
287
|
+
aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
|
|
288
|
+
urn=mcp.entityUrn,
|
|
289
|
+
)
|
|
290
|
+
else:
|
|
291
|
+
record_metadata = envelope.metadata.copy()
|
|
289
292
|
|
|
290
293
|
yield RecordEnvelope(
|
|
291
294
|
record=mcp,
|
|
@@ -125,7 +125,7 @@ class AddDatasetDomain(DatasetDomainTransformer):
|
|
|
125
125
|
return domain_mcps
|
|
126
126
|
|
|
127
127
|
for entity_urn, domain_to_add in (
|
|
128
|
-
(urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
|
|
128
|
+
(urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
|
|
129
129
|
):
|
|
130
130
|
if not domain_to_add or not domain_to_add.domains:
|
|
131
131
|
continue
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from typing import Dict, List, Optional, cast
|
|
4
|
+
|
|
5
|
+
from datahub.configuration.common import (
|
|
6
|
+
TransformerSemanticsConfigModel,
|
|
7
|
+
)
|
|
8
|
+
from datahub.emitter.mce_builder import Aspect
|
|
9
|
+
from datahub.ingestion.api.common import PipelineContext
|
|
10
|
+
from datahub.ingestion.transformer.base_transformer import (
|
|
11
|
+
BaseTransformer,
|
|
12
|
+
SingleAspectTransformer,
|
|
13
|
+
)
|
|
14
|
+
from datahub.metadata.schema_classes import (
|
|
15
|
+
BrowsePathEntryClass,
|
|
16
|
+
BrowsePathsV2Class,
|
|
17
|
+
)
|
|
18
|
+
from datahub.utilities.urns.urn import guess_entity_type
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
    # Configuration for SetBrowsePathTransformer.
    #
    # Ordered browse-path template. Each entry is either a literal node or a
    # variable such as "$container[0]" / "$container[*]" that the transformer
    # resolves against the URNs of the entity's existing browse path.
    path: List[str]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
    """Transformer that rebuilds an entity's ``browsePathsV2`` aspect from a template.

    Every entry of ``config.path`` is either a literal node or a variable of
    the form ``$<entityType>[<index>]`` / ``$<entityType>[*]``. Variables are
    resolved against the URNs present in the entity's existing browse path.
    """

    ctx: PipelineContext
    config: SetBrowsePathTransformerConfig

    def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
        super().__init__()
        self.config = config
        self.ctx = ctx

    def aspect_name(self) -> str:
        """Name of the aspect this transformer rewrites."""
        return "browsePathsV2"

    def entity_types(self) -> List[str]:
        """Entity types this transformer is applied to."""
        # The selection below is arbitrary and may be adjusted; making it
        # configurable could be reasonable.
        return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]

    @classmethod
    def create(
        cls, config_dict: dict, ctx: PipelineContext
    ) -> "SetBrowsePathTransformer":
        """Build the transformer from a raw configuration dictionary."""
        return cls(SetBrowsePathTransformerConfig.parse_obj(config_dict), ctx)

    @staticmethod
    def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
        """Collect template variables from the URNs of an existing browse path.

        For each entity type found, ``"<type>[*]"`` maps to all of its URNs in
        path order and ``"<type>[<i>]"`` maps to a one-element list holding the
        i-th URN. Entries without a URN are ignored.
        """
        urns_by_type: Dict[str, List[str]] = {}
        for entry in existing_browse_paths.path or []:
            if not entry.urn:
                continue
            urns_by_type.setdefault(guess_entity_type(entry.urn), []).append(entry.urn)

        variables: Dict[str, List[str]] = {}
        for kind, members in urns_by_type.items():
            variables[f"{kind}[*]"] = members
            variables.update(
                {f"{kind}[{pos}]": [member] for pos, member in enumerate(members)}
            )
        return variables

    @classmethod
    def _expand_nodes(
        cls, templates: List[str], template_vars: Dict[str, List[str]]
    ) -> BrowsePathsV2Class:
        """Resolve every template entry and wrap the results in a browse path.

        Empty and whitespace-only nodes are dropped. A node that starts with
        ``"urn:"`` is also stored as the entry's ``urn``.
        """
        resolved = [
            node
            for template in templates
            for node in cls._resolve_template_to_nodes(template, template_vars)
        ]
        entries: List[BrowsePathEntryClass] = [
            BrowsePathEntryClass(id=node, urn=node if node.startswith("urn:") else None)
            for node in resolved
            if node and not node.isspace()
        ]
        return BrowsePathsV2Class(path=entries)

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
    ) -> Optional[Aspect]:
        """Produce the new browse path for an entity.

        The configured template is expanded using variables taken from the
        existing aspect, if there is one. Unless ``replace_existing`` is set,
        the existing path entries are appended after the expanded ones.
        """
        variables: Dict[str, List[str]]
        if aspect is None:
            variables = {}
        else:
            assert isinstance(aspect, BrowsePathsV2Class)
            variables = self._build_model(aspect)

        rebuilt: BrowsePathsV2Class = self._expand_nodes(self.config.path, variables)
        if aspect is not None and not self.config.replace_existing:
            rebuilt.path.extend(aspect.path)

        return cast(Aspect, rebuilt)

    @staticmethod
    def _resolve_template_to_nodes(
        template_str: str, template_vars: Dict[str, List[str]]
    ) -> List[str]:
        """Resolve one template entry to zero or more path nodes.

        A stripped entry that is exactly ``$name[index]`` or ``$name[*]`` is
        looked up in ``template_vars`` and yields an empty list when unknown.
        Anything else is returned unchanged as a single literal node.
        """
        # This could be simplified (match only known variables) or made more
        # powerful (a real templating engine such as jinja).
        token = template_str.strip()
        variable = re.match(r"^\$([a-zA-Z]+\[[0-9*]+]$)", token)
        if variable is None:
            return [token]
        return template_vars.get(variable.group(1), [])
|
|
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict
|
|
|
3
3
|
|
|
4
4
|
from datahub.api.entities.assertion.assertion import BaseEntityAssertion
|
|
5
5
|
from datahub.ingestion.graph.client import get_default_graph
|
|
6
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
6
7
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
|
|
7
8
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
|
|
8
9
|
from datahub.utilities.urns.urn import Urn
|
|
@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):
|
|
|
15
16
|
|
|
16
17
|
@lru_cache
|
|
17
18
|
def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
|
|
18
|
-
with get_default_graph() as graph:
|
|
19
|
+
with get_default_graph(ClientMode.CLI) as graph:
|
|
19
20
|
props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
|
|
20
21
|
if props is not None:
|
|
21
22
|
return props.qualifiedName
|
|
@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
|
|
|
24
25
|
|
|
25
26
|
@lru_cache
|
|
26
27
|
def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
|
|
27
|
-
with get_default_graph() as graph:
|
|
28
|
+
with get_default_graph(ClientMode.INGESTION) as graph:
|
|
28
29
|
schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
|
|
29
30
|
if schema is not None:
|
|
30
31
|
return [
|
|
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
|
|
|
84
84
|
|
|
85
85
|
dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
|
|
86
86
|
dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
|
|
87
|
-
with (
|
|
88
|
-
|
|
89
|
-
|
|
87
|
+
with (
|
|
88
|
+
(dmf_definitions_path).open("w") as definitions,
|
|
89
|
+
(dmf_associations_path).open("w") as associations,
|
|
90
|
+
):
|
|
90
91
|
for assertion_spec in assertion_config_spec.assertions:
|
|
91
92
|
result.report.num_processed += 1
|
|
92
93
|
try:
|
datahub/lite/lite_util.py
CHANGED
|
@@ -99,7 +99,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
|
|
|
99
99
|
lite_class = lite_registry.get(lite_type)
|
|
100
100
|
except KeyError as e:
|
|
101
101
|
raise Exception(
|
|
102
|
-
f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping
|
|
102
|
+
f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping]}"
|
|
103
103
|
) from e
|
|
104
104
|
|
|
105
105
|
lite_specific_config = lite_class.get_config_class().parse_obj(
|
|
@@ -127,7 +127,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
|
|
|
127
127
|
return lite
|
|
128
128
|
else:
|
|
129
129
|
raise Exception(
|
|
130
|
-
f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping
|
|
130
|
+
f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping]}"
|
|
131
131
|
)
|
|
132
132
|
else:
|
|
133
133
|
return lite
|