acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/source.py
CHANGED
|
@@ -2,7 +2,6 @@ import contextlib
|
|
|
2
2
|
import datetime
|
|
3
3
|
import logging
|
|
4
4
|
from abc import ABCMeta, abstractmethod
|
|
5
|
-
from collections import defaultdict
|
|
6
5
|
from dataclasses import dataclass, field
|
|
7
6
|
from enum import Enum
|
|
8
7
|
from functools import partial
|
|
@@ -15,7 +14,6 @@ from typing import (
|
|
|
15
14
|
List,
|
|
16
15
|
Optional,
|
|
17
16
|
Sequence,
|
|
18
|
-
Set,
|
|
19
17
|
Type,
|
|
20
18
|
TypeVar,
|
|
21
19
|
Union,
|
|
@@ -27,28 +25,39 @@ from typing_extensions import LiteralString, Self
|
|
|
27
25
|
|
|
28
26
|
from datahub.configuration.common import ConfigModel
|
|
29
27
|
from datahub.configuration.source_common import PlatformInstanceConfigMixin
|
|
30
|
-
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
31
28
|
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
|
|
32
29
|
auto_patch_last_modified,
|
|
33
30
|
)
|
|
34
31
|
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
|
|
35
32
|
EnsureAspectSizeProcessor,
|
|
36
33
|
)
|
|
34
|
+
from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
|
|
35
|
+
ValidateInputFieldsProcessor,
|
|
36
|
+
)
|
|
37
37
|
from datahub.ingestion.api.closeable import Closeable
|
|
38
38
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
|
39
|
-
from datahub.ingestion.api.report import Report
|
|
39
|
+
from datahub.ingestion.api.report import ExamplesReport, Report
|
|
40
40
|
from datahub.ingestion.api.source_helpers import (
|
|
41
|
+
AutoSystemMetadata,
|
|
41
42
|
auto_browse_path_v2,
|
|
42
43
|
auto_fix_duplicate_schema_field_paths,
|
|
43
44
|
auto_fix_empty_field_paths,
|
|
44
45
|
auto_lowercase_urns,
|
|
45
46
|
auto_materialize_referenced_tags_terms,
|
|
46
47
|
auto_status_aspect,
|
|
48
|
+
auto_workunit,
|
|
47
49
|
auto_workunit_reporter,
|
|
48
50
|
)
|
|
51
|
+
from datahub.ingestion.api.source_protocols import (
|
|
52
|
+
MetadataWorkUnitIterable,
|
|
53
|
+
ProfilingCapable,
|
|
54
|
+
)
|
|
49
55
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
50
|
-
from datahub.
|
|
51
|
-
|
|
56
|
+
from datahub.ingestion.source_report.ingestion_stage import (
|
|
57
|
+
IngestionHighStage,
|
|
58
|
+
IngestionStageReport,
|
|
59
|
+
)
|
|
60
|
+
from datahub.telemetry import stats
|
|
52
61
|
from datahub.utilities.lossy_collections import LossyDict, LossyList
|
|
53
62
|
from datahub.utilities.type_annotations import get_class_from_annotation
|
|
54
63
|
|
|
@@ -72,6 +81,7 @@ class SourceCapability(Enum):
|
|
|
72
81
|
SCHEMA_METADATA = "Schema Metadata"
|
|
73
82
|
CONTAINERS = "Asset Containers"
|
|
74
83
|
CLASSIFICATION = "Classification"
|
|
84
|
+
TEST_CONNECTION = "Test Connection"
|
|
75
85
|
|
|
76
86
|
|
|
77
87
|
class StructuredLogLevel(Enum):
|
|
@@ -80,11 +90,24 @@ class StructuredLogLevel(Enum):
|
|
|
80
90
|
ERROR = logging.ERROR
|
|
81
91
|
|
|
82
92
|
|
|
93
|
+
class StructuredLogCategory(Enum):
|
|
94
|
+
"""
|
|
95
|
+
This is used to categorise the errors mainly based on the biggest impact area
|
|
96
|
+
This is to be used to help in self-serve understand the impact of any log entry
|
|
97
|
+
More enums to be added as logs are updated to be self-serve
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
LINEAGE = "LINEAGE"
|
|
101
|
+
USAGE = "USAGE"
|
|
102
|
+
PROFILING = "PROFILING"
|
|
103
|
+
|
|
104
|
+
|
|
83
105
|
@dataclass
|
|
84
106
|
class StructuredLogEntry(Report):
|
|
85
107
|
title: Optional[str]
|
|
86
108
|
message: str
|
|
87
109
|
context: LossyList[str]
|
|
110
|
+
log_category: Optional[StructuredLogCategory] = None
|
|
88
111
|
|
|
89
112
|
|
|
90
113
|
@dataclass
|
|
@@ -107,9 +130,10 @@ class StructuredLogs(Report):
|
|
|
107
130
|
exc: Optional[BaseException] = None,
|
|
108
131
|
log: bool = False,
|
|
109
132
|
stacklevel: int = 1,
|
|
133
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
110
134
|
) -> None:
|
|
111
135
|
"""
|
|
112
|
-
Report a user-facing
|
|
136
|
+
Report a user-facing log for the ingestion run.
|
|
113
137
|
|
|
114
138
|
Args:
|
|
115
139
|
level: The level of the log entry.
|
|
@@ -117,6 +141,9 @@ class StructuredLogs(Report):
|
|
|
117
141
|
title: The category / heading to present on for this message in the UI.
|
|
118
142
|
context: Additional context (e.g. where, how) for the log entry.
|
|
119
143
|
exc: The exception associated with the event. We'll show the stack trace when in debug mode.
|
|
144
|
+
log_category: The type of the log entry. This is used to categorise the log entry.
|
|
145
|
+
log: Whether to log the entry to the console.
|
|
146
|
+
stacklevel: The stack level to use for the log entry.
|
|
120
147
|
"""
|
|
121
148
|
|
|
122
149
|
# One for this method, and one for the containing report_* call.
|
|
@@ -159,6 +186,7 @@ class StructuredLogs(Report):
|
|
|
159
186
|
title=title,
|
|
160
187
|
message=message,
|
|
161
188
|
context=context_list,
|
|
189
|
+
log_category=log_category,
|
|
162
190
|
)
|
|
163
191
|
else:
|
|
164
192
|
if context is not None:
|
|
@@ -186,19 +214,11 @@ class StructuredLogs(Report):
|
|
|
186
214
|
|
|
187
215
|
|
|
188
216
|
@dataclass
|
|
189
|
-
class SourceReport(
|
|
217
|
+
class SourceReport(ExamplesReport, IngestionStageReport):
|
|
190
218
|
event_not_produced_warn: bool = True
|
|
191
219
|
events_produced: int = 0
|
|
192
220
|
events_produced_per_sec: int = 0
|
|
193
|
-
|
|
194
|
-
_urns_seen: Set[str] = field(default_factory=set)
|
|
195
|
-
entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
|
|
196
|
-
aspects: Dict[str, Dict[str, int]] = field(
|
|
197
|
-
default_factory=lambda: defaultdict(lambda: defaultdict(int))
|
|
198
|
-
)
|
|
199
|
-
aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
|
|
200
|
-
default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
|
|
201
|
-
)
|
|
221
|
+
num_input_fields_filtered: int = 0
|
|
202
222
|
|
|
203
223
|
_structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
|
|
204
224
|
|
|
@@ -216,33 +236,10 @@ class SourceReport(Report):
|
|
|
216
236
|
|
|
217
237
|
def report_workunit(self, wu: WorkUnit) -> None:
|
|
218
238
|
self.events_produced += 1
|
|
239
|
+
if not isinstance(wu, MetadataWorkUnit):
|
|
240
|
+
return
|
|
219
241
|
|
|
220
|
-
|
|
221
|
-
urn = wu.get_urn()
|
|
222
|
-
|
|
223
|
-
# Specialized entity reporting.
|
|
224
|
-
if not isinstance(wu.metadata, MetadataChangeEvent):
|
|
225
|
-
mcps = [wu.metadata]
|
|
226
|
-
else:
|
|
227
|
-
mcps = list(mcps_from_mce(wu.metadata))
|
|
228
|
-
|
|
229
|
-
for mcp in mcps:
|
|
230
|
-
entityType = mcp.entityType
|
|
231
|
-
aspectName = mcp.aspectName
|
|
232
|
-
|
|
233
|
-
if urn not in self._urns_seen:
|
|
234
|
-
self._urns_seen.add(urn)
|
|
235
|
-
self.entities[entityType].append(urn)
|
|
236
|
-
|
|
237
|
-
if aspectName is not None: # usually true
|
|
238
|
-
self.aspects[entityType][aspectName] += 1
|
|
239
|
-
self.aspect_urn_samples[entityType][aspectName].append(urn)
|
|
240
|
-
if isinstance(mcp.aspect, UpstreamLineageClass):
|
|
241
|
-
upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
|
|
242
|
-
if upstream_lineage.fineGrainedLineages:
|
|
243
|
-
self.aspect_urn_samples[entityType][
|
|
244
|
-
"fineGrainedLineages"
|
|
245
|
-
].append(urn)
|
|
242
|
+
super()._store_workunit_data(wu)
|
|
246
243
|
|
|
247
244
|
def report_warning(
|
|
248
245
|
self,
|
|
@@ -250,9 +247,19 @@ class SourceReport(Report):
|
|
|
250
247
|
context: Optional[str] = None,
|
|
251
248
|
title: Optional[LiteralString] = None,
|
|
252
249
|
exc: Optional[BaseException] = None,
|
|
250
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
253
251
|
) -> None:
|
|
252
|
+
"""
|
|
253
|
+
See docs of StructuredLogs.report_log for details of args
|
|
254
|
+
"""
|
|
254
255
|
self._structured_logs.report_log(
|
|
255
|
-
StructuredLogLevel.WARN,
|
|
256
|
+
StructuredLogLevel.WARN,
|
|
257
|
+
message,
|
|
258
|
+
title,
|
|
259
|
+
context,
|
|
260
|
+
exc,
|
|
261
|
+
log=False,
|
|
262
|
+
log_category=log_category,
|
|
256
263
|
)
|
|
257
264
|
|
|
258
265
|
def warning(
|
|
@@ -261,9 +268,20 @@ class SourceReport(Report):
|
|
|
261
268
|
context: Optional[str] = None,
|
|
262
269
|
title: Optional[LiteralString] = None,
|
|
263
270
|
exc: Optional[BaseException] = None,
|
|
271
|
+
log: bool = True,
|
|
272
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
264
273
|
) -> None:
|
|
274
|
+
"""
|
|
275
|
+
See docs of StructuredLogs.report_log for details of args
|
|
276
|
+
"""
|
|
265
277
|
self._structured_logs.report_log(
|
|
266
|
-
StructuredLogLevel.WARN,
|
|
278
|
+
StructuredLogLevel.WARN,
|
|
279
|
+
message,
|
|
280
|
+
title,
|
|
281
|
+
context,
|
|
282
|
+
exc,
|
|
283
|
+
log=log,
|
|
284
|
+
log_category=log_category,
|
|
267
285
|
)
|
|
268
286
|
|
|
269
287
|
def report_failure(
|
|
@@ -273,9 +291,19 @@ class SourceReport(Report):
|
|
|
273
291
|
title: Optional[LiteralString] = None,
|
|
274
292
|
exc: Optional[BaseException] = None,
|
|
275
293
|
log: bool = True,
|
|
294
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
276
295
|
) -> None:
|
|
296
|
+
"""
|
|
297
|
+
See docs of StructuredLogs.report_log for details of args
|
|
298
|
+
"""
|
|
277
299
|
self._structured_logs.report_log(
|
|
278
|
-
StructuredLogLevel.ERROR,
|
|
300
|
+
StructuredLogLevel.ERROR,
|
|
301
|
+
message,
|
|
302
|
+
title,
|
|
303
|
+
context,
|
|
304
|
+
exc,
|
|
305
|
+
log=log,
|
|
306
|
+
log_category=log_category,
|
|
279
307
|
)
|
|
280
308
|
|
|
281
309
|
def failure(
|
|
@@ -285,9 +313,19 @@ class SourceReport(Report):
|
|
|
285
313
|
title: Optional[LiteralString] = None,
|
|
286
314
|
exc: Optional[BaseException] = None,
|
|
287
315
|
log: bool = True,
|
|
316
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
288
317
|
) -> None:
|
|
318
|
+
"""
|
|
319
|
+
See docs of StructuredLogs.report_log for details of args
|
|
320
|
+
"""
|
|
289
321
|
self._structured_logs.report_log(
|
|
290
|
-
StructuredLogLevel.ERROR,
|
|
322
|
+
StructuredLogLevel.ERROR,
|
|
323
|
+
message,
|
|
324
|
+
title,
|
|
325
|
+
context,
|
|
326
|
+
exc,
|
|
327
|
+
log=log,
|
|
328
|
+
log_category=log_category,
|
|
291
329
|
)
|
|
292
330
|
|
|
293
331
|
def info(
|
|
@@ -297,9 +335,19 @@ class SourceReport(Report):
|
|
|
297
335
|
title: Optional[LiteralString] = None,
|
|
298
336
|
exc: Optional[BaseException] = None,
|
|
299
337
|
log: bool = True,
|
|
338
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
300
339
|
) -> None:
|
|
340
|
+
"""
|
|
341
|
+
See docs of StructuredLogs.report_log for details of args
|
|
342
|
+
"""
|
|
301
343
|
self._structured_logs.report_log(
|
|
302
|
-
StructuredLogLevel.INFO,
|
|
344
|
+
StructuredLogLevel.INFO,
|
|
345
|
+
message,
|
|
346
|
+
title,
|
|
347
|
+
context,
|
|
348
|
+
exc,
|
|
349
|
+
log=log,
|
|
350
|
+
log_category=log_category,
|
|
303
351
|
)
|
|
304
352
|
|
|
305
353
|
@contextlib.contextmanager
|
|
@@ -309,6 +357,7 @@ class SourceReport(Report):
|
|
|
309
357
|
title: Optional[LiteralString] = None,
|
|
310
358
|
context: Optional[str] = None,
|
|
311
359
|
level: StructuredLogLevel = StructuredLogLevel.ERROR,
|
|
360
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
312
361
|
) -> Iterator[None]:
|
|
313
362
|
# Convenience method that helps avoid boilerplate try/except blocks.
|
|
314
363
|
# TODO: I'm not super happy with the naming here - it's not obvious that this
|
|
@@ -317,10 +366,16 @@ class SourceReport(Report):
|
|
|
317
366
|
yield
|
|
318
367
|
except Exception as exc:
|
|
319
368
|
self._structured_logs.report_log(
|
|
320
|
-
level,
|
|
369
|
+
level,
|
|
370
|
+
message=message,
|
|
371
|
+
title=title,
|
|
372
|
+
context=context,
|
|
373
|
+
exc=exc,
|
|
374
|
+
log_category=log_category,
|
|
321
375
|
)
|
|
322
376
|
|
|
323
377
|
def __post_init__(self) -> None:
|
|
378
|
+
super().__post_init__()
|
|
324
379
|
self.start_time = datetime.datetime.now()
|
|
325
380
|
self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
|
|
326
381
|
|
|
@@ -333,6 +388,43 @@ class SourceReport(Report):
|
|
|
333
388
|
"infos": Report.to_pure_python_obj(self.infos),
|
|
334
389
|
}
|
|
335
390
|
|
|
391
|
+
@staticmethod
|
|
392
|
+
def _discretize_dict_values(
|
|
393
|
+
nested_dict: Dict[str, Dict[str, int]],
|
|
394
|
+
) -> Dict[str, Dict[str, int]]:
|
|
395
|
+
"""Helper method to discretize values in a nested dictionary structure."""
|
|
396
|
+
result = {}
|
|
397
|
+
for outer_key, inner_dict in nested_dict.items():
|
|
398
|
+
discretized_dict: Dict[str, int] = {}
|
|
399
|
+
for inner_key, count in inner_dict.items():
|
|
400
|
+
discretized_dict[inner_key] = stats.discretize(count)
|
|
401
|
+
result[outer_key] = discretized_dict
|
|
402
|
+
return result
|
|
403
|
+
|
|
404
|
+
def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
|
|
405
|
+
"""Convert the nested defaultdict aspects to a regular dict for serialization."""
|
|
406
|
+
return self._discretize_dict_values(self.aspects)
|
|
407
|
+
|
|
408
|
+
def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
|
|
409
|
+
"""Get aspect counts grouped by entity type and subtype."""
|
|
410
|
+
return self._discretize_dict_values_nested(self.aspects_by_subtypes)
|
|
411
|
+
|
|
412
|
+
@staticmethod
|
|
413
|
+
def _discretize_dict_values_nested(
|
|
414
|
+
nested_dict: Dict[str, Dict[str, Dict[str, int]]],
|
|
415
|
+
) -> Dict[str, Dict[str, Dict[str, int]]]:
|
|
416
|
+
"""Helper method to discretize values in a nested dictionary structure with three levels."""
|
|
417
|
+
result = {}
|
|
418
|
+
for outer_key, middle_dict in nested_dict.items():
|
|
419
|
+
discretized_middle_dict: Dict[str, Dict[str, int]] = {}
|
|
420
|
+
for middle_key, inner_dict in middle_dict.items():
|
|
421
|
+
discretized_inner_dict: Dict[str, int] = {}
|
|
422
|
+
for inner_key, count in inner_dict.items():
|
|
423
|
+
discretized_inner_dict[inner_key] = stats.discretize(count)
|
|
424
|
+
discretized_middle_dict[middle_key] = discretized_inner_dict
|
|
425
|
+
result[outer_key] = discretized_middle_dict
|
|
426
|
+
return result
|
|
427
|
+
|
|
336
428
|
def compute_stats(self) -> None:
|
|
337
429
|
super().compute_stats()
|
|
338
430
|
|
|
@@ -416,12 +508,9 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
416
508
|
Run in order, first in list is applied first. Be careful with order when overriding.
|
|
417
509
|
"""
|
|
418
510
|
browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
|
|
419
|
-
if
|
|
420
|
-
self.ctx.pipeline_config
|
|
421
|
-
and self.ctx.pipeline_config.flags.generate_browse_path_v2
|
|
422
|
-
):
|
|
511
|
+
if self.ctx.flags.generate_browse_path_v2:
|
|
423
512
|
browse_path_processor = self._get_browse_path_processor(
|
|
424
|
-
self.ctx.
|
|
513
|
+
self.ctx.flags.generate_browse_path_v2_dry_run
|
|
425
514
|
)
|
|
426
515
|
|
|
427
516
|
auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
|
|
@@ -452,12 +541,13 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
452
541
|
auto_status_aspect,
|
|
453
542
|
auto_materialize_referenced_tags_terms,
|
|
454
543
|
partial(
|
|
455
|
-
auto_fix_duplicate_schema_field_paths, platform=self.
|
|
544
|
+
auto_fix_duplicate_schema_field_paths, platform=self.infer_platform()
|
|
456
545
|
),
|
|
457
|
-
partial(auto_fix_empty_field_paths, platform=self.
|
|
546
|
+
partial(auto_fix_empty_field_paths, platform=self.infer_platform()),
|
|
458
547
|
browse_path_processor,
|
|
459
548
|
partial(auto_workunit_reporter, self.get_report()),
|
|
460
549
|
auto_patch_last_modified,
|
|
550
|
+
ValidateInputFieldsProcessor(self.get_report()).validate_input_fields,
|
|
461
551
|
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
|
|
462
552
|
]
|
|
463
553
|
|
|
@@ -472,11 +562,33 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
472
562
|
return stream
|
|
473
563
|
|
|
474
564
|
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
475
|
-
|
|
476
|
-
|
|
565
|
+
workunit_processors = self.get_workunit_processors()
|
|
566
|
+
workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
|
|
567
|
+
# Process main workunits
|
|
568
|
+
yield from self._apply_workunit_processors(
|
|
569
|
+
workunit_processors, auto_workunit(self.get_workunits_internal())
|
|
477
570
|
)
|
|
571
|
+
# Process profiling workunits
|
|
572
|
+
yield from self._process_profiling_stage(workunit_processors)
|
|
573
|
+
|
|
574
|
+
def _process_profiling_stage(
|
|
575
|
+
self, processors: List[Optional[MetadataWorkUnitProcessor]]
|
|
576
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
577
|
+
"""Process profiling stage if source supports it."""
|
|
578
|
+
if (
|
|
579
|
+
not isinstance(self, ProfilingCapable)
|
|
580
|
+
or not self.is_profiling_enabled_internal()
|
|
581
|
+
):
|
|
582
|
+
return
|
|
583
|
+
with self.get_report().new_high_stage(IngestionHighStage.PROFILING):
|
|
584
|
+
profiling_stream = self._apply_workunit_processors(
|
|
585
|
+
processors, auto_workunit(self.get_profiling_internal())
|
|
586
|
+
)
|
|
587
|
+
yield from profiling_stream
|
|
478
588
|
|
|
479
|
-
def get_workunits_internal(
|
|
589
|
+
def get_workunits_internal(
|
|
590
|
+
self,
|
|
591
|
+
) -> MetadataWorkUnitIterable:
|
|
480
592
|
raise NotImplementedError(
|
|
481
593
|
"get_workunits_internal must be implemented if get_workunits is not overriden."
|
|
482
594
|
)
|
|
@@ -498,9 +610,9 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
498
610
|
pass
|
|
499
611
|
|
|
500
612
|
def close(self) -> None:
|
|
501
|
-
|
|
613
|
+
self.get_report().close()
|
|
502
614
|
|
|
503
|
-
def
|
|
615
|
+
def infer_platform(self) -> Optional[str]:
|
|
504
616
|
config = self.get_config()
|
|
505
617
|
platform = (
|
|
506
618
|
getattr(config, "platform_name", None)
|
|
@@ -515,7 +627,7 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
515
627
|
def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
|
|
516
628
|
config = self.get_config()
|
|
517
629
|
|
|
518
|
-
platform = self.
|
|
630
|
+
platform = self.infer_platform()
|
|
519
631
|
env = getattr(config, "env", None)
|
|
520
632
|
browse_path_drop_dirs = [
|
|
521
633
|
platform,
|
|
@@ -13,9 +13,14 @@ from typing import (
|
|
|
13
13
|
)
|
|
14
14
|
|
|
15
15
|
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
|
16
|
-
from datahub.emitter.mce_builder import
|
|
16
|
+
from datahub.emitter.mce_builder import (
|
|
17
|
+
get_sys_time,
|
|
18
|
+
make_dataplatform_instance_urn,
|
|
19
|
+
parse_ts_millis,
|
|
20
|
+
)
|
|
17
21
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
18
22
|
from datahub.emitter.mcp_builder import entity_supports_aspect
|
|
23
|
+
from datahub.ingestion.api.common import PipelineContext
|
|
19
24
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
20
25
|
from datahub.metadata.schema_classes import (
|
|
21
26
|
BrowsePathEntryClass,
|
|
@@ -35,6 +40,7 @@ from datahub.metadata.schema_classes import (
|
|
|
35
40
|
TimeWindowSizeClass,
|
|
36
41
|
)
|
|
37
42
|
from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
|
|
43
|
+
from datahub.sdk.entity import Entity
|
|
38
44
|
from datahub.specific.dataset import DatasetPatchBuilder
|
|
39
45
|
from datahub.telemetry import telemetry
|
|
40
46
|
from datahub.utilities.urns.error import InvalidUrnError
|
|
@@ -48,7 +54,14 @@ logger = logging.getLogger(__name__)
|
|
|
48
54
|
|
|
49
55
|
|
|
50
56
|
def auto_workunit(
|
|
51
|
-
stream: Iterable[
|
|
57
|
+
stream: Iterable[
|
|
58
|
+
Union[
|
|
59
|
+
MetadataChangeEventClass,
|
|
60
|
+
MetadataChangeProposalWrapper,
|
|
61
|
+
MetadataWorkUnit,
|
|
62
|
+
Entity,
|
|
63
|
+
]
|
|
64
|
+
],
|
|
52
65
|
) -> Iterable[MetadataWorkUnit]:
|
|
53
66
|
"""Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
|
|
54
67
|
|
|
@@ -58,8 +71,12 @@ def auto_workunit(
|
|
|
58
71
|
id=MetadataWorkUnit.generate_workunit_id(item),
|
|
59
72
|
mce=item,
|
|
60
73
|
)
|
|
61
|
-
|
|
74
|
+
elif isinstance(item, MetadataChangeProposalWrapper):
|
|
62
75
|
yield item.as_workunit()
|
|
76
|
+
elif isinstance(item, Entity):
|
|
77
|
+
yield from item.as_workunits()
|
|
78
|
+
else:
|
|
79
|
+
yield item
|
|
63
80
|
|
|
64
81
|
|
|
65
82
|
def create_dataset_props_patch_builder(
|
|
@@ -75,6 +92,7 @@ def create_dataset_props_patch_builder(
|
|
|
75
92
|
patch_builder.set_last_modified(dataset_properties.lastModified)
|
|
76
93
|
patch_builder.set_qualified_name(dataset_properties.qualifiedName)
|
|
77
94
|
patch_builder.add_custom_properties(dataset_properties.customProperties)
|
|
95
|
+
patch_builder.set_external_url(dataset_properties.externalUrl)
|
|
78
96
|
|
|
79
97
|
return patch_builder
|
|
80
98
|
|
|
@@ -532,3 +550,23 @@ def _prepend_platform_instance(
|
|
|
532
550
|
return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
|
|
533
551
|
|
|
534
552
|
return entries
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
class AutoSystemMetadata:
|
|
556
|
+
def __init__(self, ctx: PipelineContext):
|
|
557
|
+
self.ctx = ctx
|
|
558
|
+
|
|
559
|
+
def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
|
|
560
|
+
for wu in stream:
|
|
561
|
+
yield self.stamp_wu(wu)
|
|
562
|
+
|
|
563
|
+
def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
|
|
564
|
+
if self.ctx.flags.set_system_metadata:
|
|
565
|
+
if not wu.metadata.systemMetadata:
|
|
566
|
+
wu.metadata.systemMetadata = SystemMetadataClass()
|
|
567
|
+
wu.metadata.systemMetadata.runId = self.ctx.run_id
|
|
568
|
+
if not wu.metadata.systemMetadata.lastObserved:
|
|
569
|
+
wu.metadata.systemMetadata.lastObserved = get_sys_time()
|
|
570
|
+
if self.ctx.flags.set_system_metadata_pipeline_name:
|
|
571
|
+
wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
|
|
572
|
+
return wu
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Iterable, Protocol, Union, runtime_checkable
|
|
2
|
+
|
|
3
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
4
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
5
|
+
from datahub.sdk.entity import Entity
|
|
6
|
+
|
|
7
|
+
# Type alias for metadata work units - Python 3.9 compatible
|
|
8
|
+
MetadataWorkUnitIterable = Iterable[
|
|
9
|
+
Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@runtime_checkable
|
|
14
|
+
class ProfilingCapable(Protocol):
|
|
15
|
+
"""Protocol for sources that support profiling functionality."""
|
|
16
|
+
|
|
17
|
+
def is_profiling_enabled_internal(self) -> bool:
|
|
18
|
+
"""Check if profiling is enabled for this source."""
|
|
19
|
+
...
|
|
20
|
+
|
|
21
|
+
def get_profiling_internal(self) -> MetadataWorkUnitIterable:
|
|
22
|
+
"""Generate profiling work units."""
|
|
23
|
+
...
|
|
File without changes
|