acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,22 @@
|
|
|
1
|
+
from enum import Enum, auto
|
|
1
2
|
from typing import Dict, List, Optional
|
|
2
3
|
|
|
3
4
|
from datahub.configuration.common import ConfigModel
|
|
5
|
+
from datahub.configuration.env_vars import get_datahub_component
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ClientMode(Enum):
|
|
9
|
+
INGESTION = auto()
|
|
10
|
+
CLI = auto()
|
|
11
|
+
SDK = auto()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
DATAHUB_COMPONENT_ENV: str = get_datahub_component().lower()
|
|
4
15
|
|
|
5
16
|
|
|
6
17
|
class DatahubClientConfig(ConfigModel):
|
|
7
18
|
"""Configuration class for holding connectivity to datahub gms"""
|
|
8
19
|
|
|
9
|
-
# TODO: Having a default for the server doesn't make a ton of sense. This should be handled
|
|
10
|
-
# by callers / the CLI, but the actual client should not have any magic.
|
|
11
20
|
server: str
|
|
12
21
|
token: Optional[str] = None
|
|
13
22
|
timeout_sec: Optional[float] = None
|
|
@@ -17,3 +26,10 @@ class DatahubClientConfig(ConfigModel):
|
|
|
17
26
|
ca_certificate_path: Optional[str] = None
|
|
18
27
|
client_certificate_path: Optional[str] = None
|
|
19
28
|
disable_ssl_verification: bool = False
|
|
29
|
+
openapi_ingestion: Optional[bool] = None
|
|
30
|
+
client_mode: Optional[ClientMode] = None
|
|
31
|
+
datahub_component: Optional[str] = None
|
|
32
|
+
server_config_refresh_interval: Optional[int] = None
|
|
33
|
+
|
|
34
|
+
class Config:
|
|
35
|
+
extra = "ignore"
|
|
@@ -1,30 +1,58 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
|
-
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
5
|
+
|
|
6
|
+
from typing_extensions import TypeAlias
|
|
4
7
|
|
|
5
8
|
from datahub.emitter.mce_builder import (
|
|
6
9
|
make_data_platform_urn,
|
|
7
10
|
make_dataplatform_instance_urn,
|
|
8
11
|
)
|
|
12
|
+
from datahub.errors import SearchFilterWarning
|
|
9
13
|
from datahub.utilities.urns.urn import guess_entity_type
|
|
10
14
|
|
|
11
|
-
RawSearchFilterRule = Dict[str,
|
|
15
|
+
RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
|
|
16
|
+
|
|
17
|
+
# This is a list of OR filters, each of which is a list of AND filters.
|
|
18
|
+
# This can be put directly into the orFilters parameter in GraphQL.
|
|
19
|
+
RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
|
|
20
|
+
|
|
21
|
+
# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
|
|
22
|
+
FilterOperator: TypeAlias = Literal[
|
|
23
|
+
"CONTAIN",
|
|
24
|
+
"EQUAL",
|
|
25
|
+
"IEQUAL",
|
|
26
|
+
"IN",
|
|
27
|
+
"EXISTS",
|
|
28
|
+
"GREATER_THAN",
|
|
29
|
+
"GREATER_THAN_OR_EQUAL_TO",
|
|
30
|
+
"LESS_THAN",
|
|
31
|
+
"LESS_THAN_OR_EQUAL_TO",
|
|
32
|
+
"START_WITH",
|
|
33
|
+
"END_WITH",
|
|
34
|
+
"DESCENDANTS_INCL",
|
|
35
|
+
"ANCESTORS_INCL",
|
|
36
|
+
"RELATED_INCL",
|
|
37
|
+
]
|
|
12
38
|
|
|
13
39
|
|
|
14
40
|
@dataclasses.dataclass
|
|
15
41
|
class SearchFilterRule:
|
|
16
42
|
field: str
|
|
17
|
-
condition:
|
|
43
|
+
condition: FilterOperator
|
|
18
44
|
values: List[str]
|
|
19
45
|
negated: bool = False
|
|
20
46
|
|
|
21
47
|
def to_raw(self) -> RawSearchFilterRule:
|
|
22
|
-
|
|
48
|
+
rule: RawSearchFilterRule = {
|
|
23
49
|
"field": self.field,
|
|
24
50
|
"condition": self.condition,
|
|
25
51
|
"values": self.values,
|
|
26
|
-
"negated": self.negated,
|
|
27
52
|
}
|
|
53
|
+
if self.negated:
|
|
54
|
+
rule["negated"] = True
|
|
55
|
+
return rule
|
|
28
56
|
|
|
29
57
|
def negate(self) -> "SearchFilterRule":
|
|
30
58
|
return SearchFilterRule(
|
|
@@ -48,15 +76,25 @@ class RemovedStatusFilter(enum.Enum):
|
|
|
48
76
|
"""Search only soft-deleted entities."""
|
|
49
77
|
|
|
50
78
|
|
|
79
|
+
def _validate_or_filter_structure(
|
|
80
|
+
or_filters: List[Dict[str, List[SearchFilterRule]]],
|
|
81
|
+
) -> None:
|
|
82
|
+
for filter_list in or_filters:
|
|
83
|
+
if "and" not in filter_list:
|
|
84
|
+
raise ValueError(f"Invalid or filter: {filter_list}")
|
|
85
|
+
if not isinstance(filter_list["and"], list):
|
|
86
|
+
raise ValueError(f"Invalid or filter: {filter_list}")
|
|
87
|
+
|
|
88
|
+
|
|
51
89
|
def generate_filter(
|
|
52
|
-
platform:
|
|
90
|
+
platform: Union[None, str, List[str]],
|
|
53
91
|
platform_instance: Optional[str],
|
|
54
92
|
env: Optional[str],
|
|
55
|
-
container:
|
|
56
|
-
status: RemovedStatusFilter,
|
|
93
|
+
container: Union[None, str, List[str]],
|
|
94
|
+
status: Optional[RemovedStatusFilter],
|
|
57
95
|
extra_filters: Optional[List[RawSearchFilterRule]],
|
|
58
|
-
extra_or_filters: Optional[
|
|
59
|
-
) ->
|
|
96
|
+
extra_or_filters: Optional[RawSearchFilter] = None,
|
|
97
|
+
) -> RawSearchFilter:
|
|
60
98
|
"""
|
|
61
99
|
Generate a search filter based on the provided parameters.
|
|
62
100
|
:param platform: The platform to filter by.
|
|
@@ -65,8 +103,7 @@ def generate_filter(
|
|
|
65
103
|
:param container: The container to filter by.
|
|
66
104
|
:param status: The status to filter by.
|
|
67
105
|
:param extra_filters: Extra AND filters to apply.
|
|
68
|
-
:param extra_or_filters: Extra OR filters to apply. These are combined with
|
|
69
|
-
the AND filters using an OR at the top level.
|
|
106
|
+
:param extra_or_filters: Extra OR filters to apply. These are combined with the AND filters using an OR at the top level.
|
|
70
107
|
"""
|
|
71
108
|
and_filters: List[RawSearchFilterRule] = []
|
|
72
109
|
|
|
@@ -85,15 +122,16 @@ def generate_filter(
|
|
|
85
122
|
and_filters.append(_get_container_filter(container).to_raw())
|
|
86
123
|
|
|
87
124
|
# Status filter.
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
125
|
+
if status:
|
|
126
|
+
status_filter = _get_status_filter(status)
|
|
127
|
+
if status_filter:
|
|
128
|
+
and_filters.append(status_filter.to_raw())
|
|
91
129
|
|
|
92
130
|
# Extra filters.
|
|
93
131
|
if extra_filters:
|
|
94
132
|
and_filters += extra_filters
|
|
95
133
|
|
|
96
|
-
or_filters:
|
|
134
|
+
or_filters: RawSearchFilter = [{"and": and_filters}]
|
|
97
135
|
|
|
98
136
|
# Env filter
|
|
99
137
|
if env:
|
|
@@ -107,11 +145,27 @@ def generate_filter(
|
|
|
107
145
|
|
|
108
146
|
# Extra OR filters are distributed across the top-level "and" lists: each extra filter's rules are appended to every existing "and" list.
|
|
109
147
|
if extra_or_filters:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
for extra_or_filter in extra_or_filters
|
|
113
|
-
|
|
114
|
-
|
|
148
|
+
new_or_filters: RawSearchFilter = []
|
|
149
|
+
for and_filter in or_filters:
|
|
150
|
+
for extra_or_filter in extra_or_filters:
|
|
151
|
+
if isinstance(extra_or_filter, dict) and "and" in extra_or_filter:
|
|
152
|
+
new_or_filters.append(
|
|
153
|
+
{"and": and_filter["and"] + extra_or_filter["and"]}
|
|
154
|
+
)
|
|
155
|
+
else:
|
|
156
|
+
# Hack for backwards compatibility.
|
|
157
|
+
# We have some code that erroneously passed a List[RawSearchFilterRule]
|
|
158
|
+
# instead of a List[Dict["and", List[RawSearchFilterRule]]].
|
|
159
|
+
warnings.warn(
|
|
160
|
+
"Passing a List[RawSearchFilterRule] to extra_or_filters is deprecated. "
|
|
161
|
+
"Please pass a List[Dict[str, List[RawSearchFilterRule]]] instead.",
|
|
162
|
+
SearchFilterWarning,
|
|
163
|
+
stacklevel=3,
|
|
164
|
+
)
|
|
165
|
+
new_or_filters.append(
|
|
166
|
+
{"and": and_filter["and"] + [extra_or_filter]} # type: ignore
|
|
167
|
+
)
|
|
168
|
+
or_filters = new_or_filters
|
|
115
169
|
|
|
116
170
|
return or_filters
|
|
117
171
|
|
|
@@ -123,7 +177,7 @@ def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
|
|
|
123
177
|
# For most entity types, we look at the origin field.
|
|
124
178
|
{
|
|
125
179
|
"field": "origin",
|
|
126
|
-
"
|
|
180
|
+
"values": [env],
|
|
127
181
|
"condition": "EQUAL",
|
|
128
182
|
},
|
|
129
183
|
# For containers, we look at the customProperties field.
|
|
@@ -131,15 +185,15 @@ def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
|
|
|
131
185
|
# we look for the "env" property. Otherwise, we use the "instance" property.
|
|
132
186
|
{
|
|
133
187
|
"field": "customProperties",
|
|
134
|
-
"
|
|
188
|
+
"values": [f"env={env}"],
|
|
135
189
|
},
|
|
136
190
|
{
|
|
137
191
|
"field": "customProperties",
|
|
138
|
-
"
|
|
192
|
+
"values": [f"instance={env}"],
|
|
139
193
|
},
|
|
140
194
|
{
|
|
141
195
|
"field": "env",
|
|
142
|
-
"
|
|
196
|
+
"values": [env],
|
|
143
197
|
},
|
|
144
198
|
# Note that not all entity types have an env (e.g. dashboards / charts).
|
|
145
199
|
# If the env filter is specified, these will be excluded.
|
|
@@ -173,23 +227,31 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
|
|
|
173
227
|
raise ValueError(f"Invalid status filter: {status}")
|
|
174
228
|
|
|
175
229
|
|
|
176
|
-
def _get_container_filter(container: str) -> SearchFilterRule:
|
|
230
|
+
def _get_container_filter(container: Union[str, List[str]]) -> SearchFilterRule:
|
|
231
|
+
if not isinstance(container, list):
|
|
232
|
+
container = [container]
|
|
233
|
+
|
|
177
234
|
# Raise if container is not a fully qualified container urn.
|
|
178
235
|
# TODO: Change this once we have a first-class container urn type.
|
|
179
|
-
|
|
180
|
-
|
|
236
|
+
for cont in container:
|
|
237
|
+
if guess_entity_type(cont) != "container":
|
|
238
|
+
raise ValueError(f"Invalid container urn: {cont}")
|
|
181
239
|
|
|
182
240
|
return SearchFilterRule(
|
|
183
241
|
field="browsePathV2",
|
|
184
|
-
values=
|
|
242
|
+
values=container,
|
|
185
243
|
condition="CONTAIN",
|
|
186
244
|
)
|
|
187
245
|
|
|
188
246
|
|
|
189
247
|
def _get_platform_instance_filter(
|
|
190
|
-
platform:
|
|
248
|
+
platform: Union[None, str, List[str]], platform_instance: str
|
|
191
249
|
) -> SearchFilterRule:
|
|
192
250
|
if platform:
|
|
251
|
+
if isinstance(platform, list):
|
|
252
|
+
raise ValueError(
|
|
253
|
+
"Platform instance filter cannot be combined with a multi-value platform filter."
|
|
254
|
+
)
|
|
193
255
|
# Massage the platform instance into a fully qualified urn, if necessary.
|
|
194
256
|
platform_instance = make_dataplatform_instance_urn(platform, platform_instance)
|
|
195
257
|
|
|
@@ -205,9 +267,11 @@ def _get_platform_instance_filter(
|
|
|
205
267
|
)
|
|
206
268
|
|
|
207
269
|
|
|
208
|
-
def _get_platform_filter(platform: str) -> SearchFilterRule:
|
|
270
|
+
def _get_platform_filter(platform: Union[str, List[str]]) -> SearchFilterRule:
|
|
271
|
+
if not isinstance(platform, list):
|
|
272
|
+
platform = [platform]
|
|
209
273
|
return SearchFilterRule(
|
|
210
274
|
field="platform.keyword",
|
|
211
275
|
condition="EQUAL",
|
|
212
|
-
values=[make_data_platform_urn(platform
|
|
276
|
+
values=[make_data_platform_urn(plt) for plt in platform],
|
|
213
277
|
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import urllib.parse
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import datahub.metadata.urns as urns
|
|
5
|
+
from datahub.utilities.urns.urn import guess_entity_type
|
|
6
|
+
|
|
7
|
+
_url_prefixes = {
|
|
8
|
+
# Atypical mappings.
|
|
9
|
+
urns.DataJobUrn.ENTITY_TYPE: "tasks",
|
|
10
|
+
urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
|
|
11
|
+
urns.CorpUserUrn.ENTITY_TYPE: "user",
|
|
12
|
+
urns.CorpGroupUrn.ENTITY_TYPE: "group",
|
|
13
|
+
# Normal mappings - matches the entity type.
|
|
14
|
+
urns.ChartUrn.ENTITY_TYPE: "chart",
|
|
15
|
+
urns.ContainerUrn.ENTITY_TYPE: "container",
|
|
16
|
+
urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
|
|
17
|
+
urns.DatasetUrn.ENTITY_TYPE: "dataset",
|
|
18
|
+
urns.DashboardUrn.ENTITY_TYPE: "dashboard",
|
|
19
|
+
urns.DomainUrn.ENTITY_TYPE: "domain",
|
|
20
|
+
urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
|
|
21
|
+
urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
|
|
22
|
+
urns.TagUrn.ENTITY_TYPE: "tag",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def make_url_for_urn(
|
|
27
|
+
frontend_base_url: str,
|
|
28
|
+
entity_urn: str,
|
|
29
|
+
*,
|
|
30
|
+
tab: Optional[str] = None,
|
|
31
|
+
) -> str:
|
|
32
|
+
"""Build the public-facing URL for an entity urn.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
frontend_base_url: The public-facing base url of the frontend.
|
|
36
|
+
entity_urn: The urn of the entity to get the url for.
|
|
37
|
+
tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
The public-facing url for the entity.
|
|
41
|
+
|
|
42
|
+
Examples:
|
|
43
|
+
>>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
|
|
44
|
+
'https://demo.datahub.com/container/urn%3Ali%3Acontainer%3Ab41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
|
|
45
|
+
>>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
|
|
46
|
+
'https://demo.datahub.com/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Asnowflake%2Clong_tail_companions.adoption.actuating%2CPROD%29/'
|
|
47
|
+
"""
|
|
48
|
+
entity_type = guess_entity_type(entity_urn)
|
|
49
|
+
encoded_entity_urn = urllib.parse.quote(entity_urn, safe="")
|
|
50
|
+
|
|
51
|
+
url_prefix = _url_prefixes.get(entity_type, entity_type)
|
|
52
|
+
url = f"{frontend_base_url}/{url_prefix}/{encoded_entity_urn}/"
|
|
53
|
+
if tab:
|
|
54
|
+
url += f"{tab}"
|
|
55
|
+
return url
|
|
@@ -13,6 +13,7 @@ from datahub.configuration.common import (
|
|
|
13
13
|
from datahub.emitter.aspect import JSON_CONTENT_TYPE
|
|
14
14
|
from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn
|
|
15
15
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
16
|
+
from datahub.emitter.rest_emitter import EmitMode
|
|
16
17
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
|
|
17
18
|
from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
|
|
18
19
|
from datahub.ingestion.api.sink import NoopWriteCallback, Sink
|
|
@@ -111,6 +112,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
|
|
|
111
112
|
def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
|
|
112
113
|
assert ctx.pipeline_config is not None
|
|
113
114
|
|
|
115
|
+
self.ctx = ctx
|
|
114
116
|
self.sink: Sink = sink
|
|
115
117
|
self.report_recipe = report_recipe
|
|
116
118
|
ingestion_source_key = self.generate_unique_key(ctx.pipeline_config)
|
|
@@ -191,18 +193,25 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
|
|
|
191
193
|
)
|
|
192
194
|
return json.dumps(converted_recipe)
|
|
193
195
|
|
|
194
|
-
def _emit_aspect(
|
|
195
|
-
self
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
),
|
|
201
|
-
metadata={},
|
|
202
|
-
),
|
|
203
|
-
NoopWriteCallback(),
|
|
196
|
+
def _emit_aspect(
|
|
197
|
+
self, entity_urn: Urn, aspect_value: _Aspect, try_sync: bool = False
|
|
198
|
+
) -> None:
|
|
199
|
+
mcp = MetadataChangeProposalWrapper(
|
|
200
|
+
entityUrn=str(entity_urn),
|
|
201
|
+
aspect=aspect_value,
|
|
204
202
|
)
|
|
205
203
|
|
|
204
|
+
if try_sync and self.ctx.graph:
|
|
205
|
+
self.ctx.graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
|
|
206
|
+
else:
|
|
207
|
+
self.sink.write_record_async(
|
|
208
|
+
RecordEnvelope(
|
|
209
|
+
record=mcp,
|
|
210
|
+
metadata={},
|
|
211
|
+
),
|
|
212
|
+
NoopWriteCallback(),
|
|
213
|
+
)
|
|
214
|
+
|
|
206
215
|
def on_start(self, ctx: PipelineContext) -> None:
|
|
207
216
|
assert ctx.pipeline_config is not None
|
|
208
217
|
# Construct the dataHubExecutionRequestInput aspect
|
|
@@ -223,6 +232,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
|
|
|
223
232
|
self._emit_aspect(
|
|
224
233
|
entity_urn=self.execution_request_input_urn,
|
|
225
234
|
aspect_value=execution_input_aspect,
|
|
235
|
+
try_sync=True,
|
|
226
236
|
)
|
|
227
237
|
|
|
228
238
|
def on_completion(
|
|
@@ -258,4 +268,4 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
|
|
|
258
268
|
entity_urn=self.execution_request_input_urn,
|
|
259
269
|
aspect_value=execution_result_aspect,
|
|
260
270
|
)
|
|
261
|
-
|
|
271
|
+
# Note: sink.close() is handled by the pipeline's context manager
|
|
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
|
|
|
31
31
|
from datahub.ingestion.api.transform import Transformer
|
|
32
32
|
from datahub.ingestion.extractor.extractor_registry import extractor_registry
|
|
33
33
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
34
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
34
35
|
from datahub.ingestion.reporting.reporting_provider_registry import (
|
|
35
36
|
reporting_provider_registry,
|
|
36
37
|
)
|
|
@@ -39,13 +40,14 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
|
|
|
39
40
|
from datahub.ingestion.sink.datahub_rest import DatahubRestSink
|
|
40
41
|
from datahub.ingestion.sink.sink_registry import sink_registry
|
|
41
42
|
from datahub.ingestion.source.source_registry import source_registry
|
|
42
|
-
from datahub.ingestion.transformer.system_metadata_transformer import (
|
|
43
|
-
SystemMetadataTransformer,
|
|
44
|
-
)
|
|
45
43
|
from datahub.ingestion.transformer.transform_registry import transform_registry
|
|
46
44
|
from datahub.sdk._attribution import KnownAttribution, change_default_attribution
|
|
47
45
|
from datahub.telemetry import stats
|
|
48
46
|
from datahub.telemetry.telemetry import telemetry_instance
|
|
47
|
+
from datahub.upgrade.upgrade import (
|
|
48
|
+
is_server_default_cli_ahead,
|
|
49
|
+
retrieve_version_stats,
|
|
50
|
+
)
|
|
49
51
|
from datahub.utilities._custom_package_loader import model_version_name
|
|
50
52
|
from datahub.utilities.global_warning_util import (
|
|
51
53
|
clear_global_warnings,
|
|
@@ -139,9 +141,8 @@ class CliReport(Report):
|
|
|
139
141
|
|
|
140
142
|
|
|
141
143
|
def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
|
|
142
|
-
graph = get_default_graph()
|
|
144
|
+
graph = get_default_graph(ClientMode.INGESTION)
|
|
143
145
|
sink_config = graph._make_rest_sink_config()
|
|
144
|
-
|
|
145
146
|
return DatahubRestSink(ctx, sink_config)
|
|
146
147
|
|
|
147
148
|
|
|
@@ -174,10 +175,14 @@ class Pipeline:
|
|
|
174
175
|
self.last_time_printed = int(time.time())
|
|
175
176
|
self.cli_report = CliReport()
|
|
176
177
|
|
|
177
|
-
with
|
|
178
|
+
with (
|
|
179
|
+
contextlib.ExitStack() as exit_stack,
|
|
180
|
+
contextlib.ExitStack() as inner_exit_stack,
|
|
181
|
+
):
|
|
178
182
|
self.graph: Optional[DataHubGraph] = None
|
|
179
183
|
with _add_init_error_context("connect to DataHub"):
|
|
180
184
|
if self.config.datahub_api:
|
|
185
|
+
self.config.datahub_api.client_mode = ClientMode.INGESTION
|
|
181
186
|
self.graph = exit_stack.enter_context(
|
|
182
187
|
DataHubGraph(self.config.datahub_api)
|
|
183
188
|
)
|
|
@@ -260,6 +265,11 @@ class Pipeline:
|
|
|
260
265
|
with _add_init_error_context("configure transformers"):
|
|
261
266
|
self._configure_transforms()
|
|
262
267
|
|
|
268
|
+
# Register completion callback with sink to handle final reporting
|
|
269
|
+
self.sink.register_pre_shutdown_callback(
|
|
270
|
+
self._notify_reporters_on_ingestion_completion
|
|
271
|
+
)
|
|
272
|
+
|
|
263
273
|
# If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
|
|
264
274
|
# We need to use an exit stack so that if we have an exception during initialization,
|
|
265
275
|
# things that were already initialized are still cleaned up.
|
|
@@ -286,9 +296,6 @@ class Pipeline:
|
|
|
286
296
|
f"Transformer type:{transformer_type},{transformer_class} configured"
|
|
287
297
|
)
|
|
288
298
|
|
|
289
|
-
# Add the system metadata transformer at the end of the list.
|
|
290
|
-
self.transformers.append(SystemMetadataTransformer(self.ctx))
|
|
291
|
-
|
|
292
299
|
def _configure_reporting(self, report_to: Optional[str]) -> None:
|
|
293
300
|
if self.dry_run:
|
|
294
301
|
# In dry run mode, we don't want to report anything.
|
|
@@ -342,8 +349,48 @@ class Pipeline:
|
|
|
342
349
|
for reporter in self.reporters:
|
|
343
350
|
try:
|
|
344
351
|
reporter.on_start(ctx=self.ctx)
|
|
345
|
-
except Exception
|
|
346
|
-
logger.warning("Reporting failed on start", exc_info=
|
|
352
|
+
except Exception:
|
|
353
|
+
logger.warning("Reporting failed on start", exc_info=True)
|
|
354
|
+
|
|
355
|
+
def _warn_old_cli_version(self) -> None:
|
|
356
|
+
"""
|
|
357
|
+
Check if the server default CLI version is ahead of the CLI version being used.
|
|
358
|
+
If so, add a warning to the report.
|
|
359
|
+
"""
|
|
360
|
+
|
|
361
|
+
try:
|
|
362
|
+
version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
|
|
363
|
+
except RuntimeError as e:
|
|
364
|
+
# Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
|
|
365
|
+
if "no current event loop" in str(e):
|
|
366
|
+
logger.debug("Skipping version check - no event loop available")
|
|
367
|
+
return
|
|
368
|
+
raise
|
|
369
|
+
|
|
370
|
+
if not version_stats or not self.graph:
|
|
371
|
+
return
|
|
372
|
+
|
|
373
|
+
if is_server_default_cli_ahead(version_stats):
|
|
374
|
+
server_default_version = (
|
|
375
|
+
version_stats.server.current_server_default_cli_version.version
|
|
376
|
+
if version_stats.server.current_server_default_cli_version
|
|
377
|
+
else None
|
|
378
|
+
)
|
|
379
|
+
current_version = version_stats.client.current.version
|
|
380
|
+
|
|
381
|
+
logger.debug(
|
|
382
|
+
f"""
|
|
383
|
+
client_version: {current_version}
|
|
384
|
+
server_default_version: {server_default_version}
|
|
385
|
+
server_default_cli_ahead: True
|
|
386
|
+
"""
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
self.source.get_report().warning(
|
|
390
|
+
title="Server default CLI version is ahead of CLI version",
|
|
391
|
+
message="Please upgrade the CLI version being used",
|
|
392
|
+
context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
|
|
393
|
+
)
|
|
347
394
|
|
|
348
395
|
def _notify_reporters_on_ingestion_completion(self) -> None:
|
|
349
396
|
for reporter in self.reporters:
|
|
@@ -365,8 +412,8 @@ class Pipeline:
|
|
|
365
412
|
report=self._get_structured_report(),
|
|
366
413
|
ctx=self.ctx,
|
|
367
414
|
)
|
|
368
|
-
except Exception
|
|
369
|
-
logger.warning("Reporting failed on completion", exc_info=
|
|
415
|
+
except Exception:
|
|
416
|
+
logger.warning("Reporting failed on completion", exc_info=True)
|
|
370
417
|
|
|
371
418
|
@classmethod
|
|
372
419
|
def create(
|
|
@@ -400,7 +447,20 @@ class Pipeline:
|
|
|
400
447
|
return True
|
|
401
448
|
return False
|
|
402
449
|
|
|
450
|
+
def _set_platform(self) -> None:
|
|
451
|
+
platform = self.source.infer_platform()
|
|
452
|
+
if platform:
|
|
453
|
+
self.source.get_report().set_platform(platform)
|
|
454
|
+
else:
|
|
455
|
+
self.source.get_report().warning(
|
|
456
|
+
message="Platform not found",
|
|
457
|
+
title="Platform not found",
|
|
458
|
+
context="Platform not found",
|
|
459
|
+
)
|
|
460
|
+
|
|
403
461
|
def run(self) -> None:
|
|
462
|
+
self._set_platform()
|
|
463
|
+
self._warn_old_cli_version()
|
|
404
464
|
with self.exit_stack, self.inner_exit_stack:
|
|
405
465
|
if self.config.flags.generate_memory_profiles:
|
|
406
466
|
import memray
|
|
@@ -466,10 +526,10 @@ class Pipeline:
|
|
|
466
526
|
|
|
467
527
|
except (RuntimeError, SystemExit):
|
|
468
528
|
raise
|
|
469
|
-
except Exception
|
|
529
|
+
except Exception:
|
|
470
530
|
logger.error(
|
|
471
531
|
"Failed to process some records. Continuing.",
|
|
472
|
-
exc_info=
|
|
532
|
+
exc_info=True,
|
|
473
533
|
)
|
|
474
534
|
# TODO: Transformer errors should be reported more loudly / as part of the pipeline report.
|
|
475
535
|
|
|
@@ -498,9 +558,9 @@ class Pipeline:
|
|
|
498
558
|
|
|
499
559
|
self.process_commits()
|
|
500
560
|
self.final_status = PipelineStatus.COMPLETED
|
|
501
|
-
except (SystemExit, KeyboardInterrupt)
|
|
561
|
+
except (SystemExit, KeyboardInterrupt):
|
|
502
562
|
self.final_status = PipelineStatus.CANCELLED
|
|
503
|
-
logger.error("Caught error", exc_info=
|
|
563
|
+
logger.error("Caught error", exc_info=True)
|
|
504
564
|
raise
|
|
505
565
|
except Exception as exc:
|
|
506
566
|
self.final_status = PipelineStatus.ERROR
|
|
@@ -508,8 +568,6 @@ class Pipeline:
|
|
|
508
568
|
finally:
|
|
509
569
|
clear_global_warnings()
|
|
510
570
|
|
|
511
|
-
self._notify_reporters_on_ingestion_completion()
|
|
512
|
-
|
|
513
571
|
def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
|
|
514
572
|
"""
|
|
515
573
|
Transforms the given sequence of records by passing the records through the transformers
|
|
@@ -561,18 +619,20 @@ class Pipeline:
|
|
|
561
619
|
def raise_from_status(self, raise_warnings: bool = False) -> None:
|
|
562
620
|
if self.source.get_report().failures:
|
|
563
621
|
raise PipelineExecutionError(
|
|
564
|
-
"Source reported errors", self.source.get_report()
|
|
622
|
+
"Source reported errors", self.source.get_report().failures
|
|
565
623
|
)
|
|
566
624
|
if self.sink.get_report().failures:
|
|
567
|
-
raise PipelineExecutionError(
|
|
625
|
+
raise PipelineExecutionError(
|
|
626
|
+
"Sink reported errors", self.sink.get_report().failures
|
|
627
|
+
)
|
|
568
628
|
if raise_warnings:
|
|
569
629
|
if self.source.get_report().warnings:
|
|
570
630
|
raise PipelineExecutionError(
|
|
571
|
-
"Source reported warnings", self.source.get_report()
|
|
631
|
+
"Source reported warnings", self.source.get_report().warnings
|
|
572
632
|
)
|
|
573
633
|
if self.sink.get_report().warnings:
|
|
574
634
|
raise PipelineExecutionError(
|
|
575
|
-
"Sink reported warnings", self.sink.get_report()
|
|
635
|
+
"Sink reported warnings", self.sink.get_report().warnings
|
|
576
636
|
)
|
|
577
637
|
|
|
578
638
|
def log_ingestion_stats(self) -> None:
|
|
@@ -581,15 +641,22 @@ class Pipeline:
|
|
|
581
641
|
sink_failures = len(self.sink.get_report().failures)
|
|
582
642
|
sink_warnings = len(self.sink.get_report().warnings)
|
|
583
643
|
global_warnings = len(get_global_warnings())
|
|
644
|
+
source_aspects = self.source.get_report().get_aspects_dict()
|
|
645
|
+
source_aspects_by_subtype = (
|
|
646
|
+
self.source.get_report().get_aspects_by_subtypes_dict()
|
|
647
|
+
)
|
|
584
648
|
|
|
585
649
|
telemetry_instance.ping(
|
|
586
650
|
"ingest_stats",
|
|
587
651
|
{
|
|
588
652
|
"source_type": self.source_type,
|
|
653
|
+
"source_aspects": source_aspects,
|
|
654
|
+
"source_aspects_by_subtype": source_aspects_by_subtype,
|
|
589
655
|
"sink_type": self.sink_type,
|
|
590
656
|
"transformer_types": [
|
|
591
657
|
transformer.type for transformer in self.config.transformers or []
|
|
592
658
|
],
|
|
659
|
+
"extractor_type": self.config.source.extractor,
|
|
593
660
|
"records_written": stats.discretize(
|
|
594
661
|
self.sink.get_report().total_records_written
|
|
595
662
|
),
|
|
@@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional
|
|
|
6
6
|
|
|
7
7
|
from pydantic import Field, validator
|
|
8
8
|
|
|
9
|
-
from datahub.configuration.common import ConfigModel, DynamicTypedConfig
|
|
10
|
-
from datahub.ingestion.graph.
|
|
9
|
+
from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
|
|
10
|
+
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
11
11
|
from datahub.ingestion.sink.file import FileSinkConfig
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
|
|
|
85
85
|
source: SourceConfig
|
|
86
86
|
sink: Optional[DynamicTypedConfig] = None
|
|
87
87
|
transformers: Optional[List[DynamicTypedConfig]] = None
|
|
88
|
-
flags: FlagsConfig =
|
|
88
|
+
flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
|
|
89
89
|
reporting: List[ReporterConfig] = []
|
|
90
90
|
run_id: str = DEFAULT_RUN_ID
|
|
91
91
|
datahub_api: Optional[DatahubClientConfig] = None
|