acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/source.py
CHANGED
|
@@ -2,7 +2,6 @@ import contextlib
|
|
|
2
2
|
import datetime
|
|
3
3
|
import logging
|
|
4
4
|
from abc import ABCMeta, abstractmethod
|
|
5
|
-
from collections import defaultdict
|
|
6
5
|
from dataclasses import dataclass, field
|
|
7
6
|
from enum import Enum
|
|
8
7
|
from functools import partial
|
|
@@ -15,7 +14,6 @@ from typing import (
|
|
|
15
14
|
List,
|
|
16
15
|
Optional,
|
|
17
16
|
Sequence,
|
|
18
|
-
Set,
|
|
19
17
|
Type,
|
|
20
18
|
TypeVar,
|
|
21
19
|
Union,
|
|
@@ -27,17 +25,18 @@ from typing_extensions import LiteralString, Self
|
|
|
27
25
|
|
|
28
26
|
from datahub.configuration.common import ConfigModel
|
|
29
27
|
from datahub.configuration.source_common import PlatformInstanceConfigMixin
|
|
30
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
31
|
-
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
32
28
|
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
|
|
33
29
|
auto_patch_last_modified,
|
|
34
30
|
)
|
|
35
31
|
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
|
|
36
32
|
EnsureAspectSizeProcessor,
|
|
37
33
|
)
|
|
34
|
+
from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
|
|
35
|
+
ValidateInputFieldsProcessor,
|
|
36
|
+
)
|
|
38
37
|
from datahub.ingestion.api.closeable import Closeable
|
|
39
38
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
|
40
|
-
from datahub.ingestion.api.report import Report
|
|
39
|
+
from datahub.ingestion.api.report import ExamplesReport, Report
|
|
41
40
|
from datahub.ingestion.api.source_helpers import (
|
|
42
41
|
AutoSystemMetadata,
|
|
43
42
|
auto_browse_path_v2,
|
|
@@ -49,10 +48,16 @@ from datahub.ingestion.api.source_helpers import (
|
|
|
49
48
|
auto_workunit,
|
|
50
49
|
auto_workunit_reporter,
|
|
51
50
|
)
|
|
51
|
+
from datahub.ingestion.api.source_protocols import (
|
|
52
|
+
MetadataWorkUnitIterable,
|
|
53
|
+
ProfilingCapable,
|
|
54
|
+
)
|
|
52
55
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
53
|
-
from datahub.
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
from datahub.ingestion.source_report.ingestion_stage import (
|
|
57
|
+
IngestionHighStage,
|
|
58
|
+
IngestionStageReport,
|
|
59
|
+
)
|
|
60
|
+
from datahub.telemetry import stats
|
|
56
61
|
from datahub.utilities.lossy_collections import LossyDict, LossyList
|
|
57
62
|
from datahub.utilities.type_annotations import get_class_from_annotation
|
|
58
63
|
|
|
@@ -76,6 +81,7 @@ class SourceCapability(Enum):
|
|
|
76
81
|
SCHEMA_METADATA = "Schema Metadata"
|
|
77
82
|
CONTAINERS = "Asset Containers"
|
|
78
83
|
CLASSIFICATION = "Classification"
|
|
84
|
+
TEST_CONNECTION = "Test Connection"
|
|
79
85
|
|
|
80
86
|
|
|
81
87
|
class StructuredLogLevel(Enum):
|
|
@@ -84,11 +90,24 @@ class StructuredLogLevel(Enum):
|
|
|
84
90
|
ERROR = logging.ERROR
|
|
85
91
|
|
|
86
92
|
|
|
93
|
+
class StructuredLogCategory(Enum):
|
|
94
|
+
"""
|
|
95
|
+
This is used to categorise the errors mainly based on the biggest impact area
|
|
96
|
+
This is to be used to help in self-serve understand the impact of any log entry
|
|
97
|
+
More enums to be added as logs are updated to be self-serve
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
LINEAGE = "LINEAGE"
|
|
101
|
+
USAGE = "USAGE"
|
|
102
|
+
PROFILING = "PROFILING"
|
|
103
|
+
|
|
104
|
+
|
|
87
105
|
@dataclass
|
|
88
106
|
class StructuredLogEntry(Report):
|
|
89
107
|
title: Optional[str]
|
|
90
108
|
message: str
|
|
91
109
|
context: LossyList[str]
|
|
110
|
+
log_category: Optional[StructuredLogCategory] = None
|
|
92
111
|
|
|
93
112
|
|
|
94
113
|
@dataclass
|
|
@@ -111,9 +130,10 @@ class StructuredLogs(Report):
|
|
|
111
130
|
exc: Optional[BaseException] = None,
|
|
112
131
|
log: bool = False,
|
|
113
132
|
stacklevel: int = 1,
|
|
133
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
114
134
|
) -> None:
|
|
115
135
|
"""
|
|
116
|
-
Report a user-facing warning for the ingestion run.
|
|
136
|
+
Report a user-facing log for the ingestion run.
|
|
117
137
|
|
|
118
138
|
Args:
|
|
119
139
|
level: The level of the log entry.
|
|
@@ -121,6 +141,9 @@ class StructuredLogs(Report):
|
|
|
121
141
|
title: The category / heading to present on for this message in the UI.
|
|
122
142
|
context: Additional context (e.g. where, how) for the log entry.
|
|
123
143
|
exc: The exception associated with the event. We'll show the stack trace when in debug mode.
|
|
144
|
+
log_category: The type of the log entry. This is used to categorise the log entry.
|
|
145
|
+
log: Whether to log the entry to the console.
|
|
146
|
+
stacklevel: The stack level to use for the log entry.
|
|
124
147
|
"""
|
|
125
148
|
|
|
126
149
|
# One for this method, and one for the containing report_* call.
|
|
@@ -163,6 +186,7 @@ class StructuredLogs(Report):
|
|
|
163
186
|
title=title,
|
|
164
187
|
message=message,
|
|
165
188
|
context=context_list,
|
|
189
|
+
log_category=log_category,
|
|
166
190
|
)
|
|
167
191
|
else:
|
|
168
192
|
if context is not None:
|
|
@@ -190,19 +214,11 @@ class StructuredLogs(Report):
|
|
|
190
214
|
|
|
191
215
|
|
|
192
216
|
@dataclass
|
|
193
|
-
class SourceReport(Report):
|
|
217
|
+
class SourceReport(ExamplesReport, IngestionStageReport):
|
|
194
218
|
event_not_produced_warn: bool = True
|
|
195
219
|
events_produced: int = 0
|
|
196
220
|
events_produced_per_sec: int = 0
|
|
197
|
-
|
|
198
|
-
_urns_seen: Set[str] = field(default_factory=set)
|
|
199
|
-
entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
|
|
200
|
-
aspects: Dict[str, Dict[str, int]] = field(
|
|
201
|
-
default_factory=lambda: defaultdict(lambda: defaultdict(int))
|
|
202
|
-
)
|
|
203
|
-
aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
|
|
204
|
-
default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
|
|
205
|
-
)
|
|
221
|
+
num_input_fields_filtered: int = 0
|
|
206
222
|
|
|
207
223
|
_structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
|
|
208
224
|
|
|
@@ -220,33 +236,10 @@ class SourceReport(Report):
|
|
|
220
236
|
|
|
221
237
|
def report_workunit(self, wu: WorkUnit) -> None:
|
|
222
238
|
self.events_produced += 1
|
|
239
|
+
if not isinstance(wu, MetadataWorkUnit):
|
|
240
|
+
return
|
|
223
241
|
|
|
224
|
-
|
|
225
|
-
urn = wu.get_urn()
|
|
226
|
-
|
|
227
|
-
# Specialized entity reporting.
|
|
228
|
-
if not isinstance(wu.metadata, MetadataChangeEvent):
|
|
229
|
-
mcps = [wu.metadata]
|
|
230
|
-
else:
|
|
231
|
-
mcps = list(mcps_from_mce(wu.metadata))
|
|
232
|
-
|
|
233
|
-
for mcp in mcps:
|
|
234
|
-
entityType = mcp.entityType
|
|
235
|
-
aspectName = mcp.aspectName
|
|
236
|
-
|
|
237
|
-
if urn not in self._urns_seen:
|
|
238
|
-
self._urns_seen.add(urn)
|
|
239
|
-
self.entities[entityType].append(urn)
|
|
240
|
-
|
|
241
|
-
if aspectName is not None: # usually true
|
|
242
|
-
self.aspects[entityType][aspectName] += 1
|
|
243
|
-
self.aspect_urn_samples[entityType][aspectName].append(urn)
|
|
244
|
-
if isinstance(mcp.aspect, UpstreamLineageClass):
|
|
245
|
-
upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
|
|
246
|
-
if upstream_lineage.fineGrainedLineages:
|
|
247
|
-
self.aspect_urn_samples[entityType][
|
|
248
|
-
"fineGrainedLineages"
|
|
249
|
-
].append(urn)
|
|
242
|
+
super()._store_workunit_data(wu)
|
|
250
243
|
|
|
251
244
|
def report_warning(
|
|
252
245
|
self,
|
|
@@ -254,9 +247,19 @@ class SourceReport(Report):
|
|
|
254
247
|
context: Optional[str] = None,
|
|
255
248
|
title: Optional[LiteralString] = None,
|
|
256
249
|
exc: Optional[BaseException] = None,
|
|
250
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
257
251
|
) -> None:
|
|
252
|
+
"""
|
|
253
|
+
See docs of StructuredLogs.report_log for details of args
|
|
254
|
+
"""
|
|
258
255
|
self._structured_logs.report_log(
|
|
259
|
-
StructuredLogLevel.WARN,
|
|
256
|
+
StructuredLogLevel.WARN,
|
|
257
|
+
message,
|
|
258
|
+
title,
|
|
259
|
+
context,
|
|
260
|
+
exc,
|
|
261
|
+
log=False,
|
|
262
|
+
log_category=log_category,
|
|
260
263
|
)
|
|
261
264
|
|
|
262
265
|
def warning(
|
|
@@ -265,9 +268,20 @@ class SourceReport(Report):
|
|
|
265
268
|
context: Optional[str] = None,
|
|
266
269
|
title: Optional[LiteralString] = None,
|
|
267
270
|
exc: Optional[BaseException] = None,
|
|
271
|
+
log: bool = True,
|
|
272
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
268
273
|
) -> None:
|
|
274
|
+
"""
|
|
275
|
+
See docs of StructuredLogs.report_log for details of args
|
|
276
|
+
"""
|
|
269
277
|
self._structured_logs.report_log(
|
|
270
|
-
StructuredLogLevel.WARN,
|
|
278
|
+
StructuredLogLevel.WARN,
|
|
279
|
+
message,
|
|
280
|
+
title,
|
|
281
|
+
context,
|
|
282
|
+
exc,
|
|
283
|
+
log=log,
|
|
284
|
+
log_category=log_category,
|
|
271
285
|
)
|
|
272
286
|
|
|
273
287
|
def report_failure(
|
|
@@ -277,9 +291,19 @@ class SourceReport(Report):
|
|
|
277
291
|
title: Optional[LiteralString] = None,
|
|
278
292
|
exc: Optional[BaseException] = None,
|
|
279
293
|
log: bool = True,
|
|
294
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
280
295
|
) -> None:
|
|
296
|
+
"""
|
|
297
|
+
See docs of StructuredLogs.report_log for details of args
|
|
298
|
+
"""
|
|
281
299
|
self._structured_logs.report_log(
|
|
282
|
-
StructuredLogLevel.ERROR,
|
|
300
|
+
StructuredLogLevel.ERROR,
|
|
301
|
+
message,
|
|
302
|
+
title,
|
|
303
|
+
context,
|
|
304
|
+
exc,
|
|
305
|
+
log=log,
|
|
306
|
+
log_category=log_category,
|
|
283
307
|
)
|
|
284
308
|
|
|
285
309
|
def failure(
|
|
@@ -289,9 +313,19 @@ class SourceReport(Report):
|
|
|
289
313
|
title: Optional[LiteralString] = None,
|
|
290
314
|
exc: Optional[BaseException] = None,
|
|
291
315
|
log: bool = True,
|
|
316
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
292
317
|
) -> None:
|
|
318
|
+
"""
|
|
319
|
+
See docs of StructuredLogs.report_log for details of args
|
|
320
|
+
"""
|
|
293
321
|
self._structured_logs.report_log(
|
|
294
|
-
StructuredLogLevel.ERROR,
|
|
322
|
+
StructuredLogLevel.ERROR,
|
|
323
|
+
message,
|
|
324
|
+
title,
|
|
325
|
+
context,
|
|
326
|
+
exc,
|
|
327
|
+
log=log,
|
|
328
|
+
log_category=log_category,
|
|
295
329
|
)
|
|
296
330
|
|
|
297
331
|
def info(
|
|
@@ -301,9 +335,19 @@ class SourceReport(Report):
|
|
|
301
335
|
title: Optional[LiteralString] = None,
|
|
302
336
|
exc: Optional[BaseException] = None,
|
|
303
337
|
log: bool = True,
|
|
338
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
304
339
|
) -> None:
|
|
340
|
+
"""
|
|
341
|
+
See docs of StructuredLogs.report_log for details of args
|
|
342
|
+
"""
|
|
305
343
|
self._structured_logs.report_log(
|
|
306
|
-
StructuredLogLevel.INFO,
|
|
344
|
+
StructuredLogLevel.INFO,
|
|
345
|
+
message,
|
|
346
|
+
title,
|
|
347
|
+
context,
|
|
348
|
+
exc,
|
|
349
|
+
log=log,
|
|
350
|
+
log_category=log_category,
|
|
307
351
|
)
|
|
308
352
|
|
|
309
353
|
@contextlib.contextmanager
|
|
@@ -313,6 +357,7 @@ class SourceReport(Report):
|
|
|
313
357
|
title: Optional[LiteralString] = None,
|
|
314
358
|
context: Optional[str] = None,
|
|
315
359
|
level: StructuredLogLevel = StructuredLogLevel.ERROR,
|
|
360
|
+
log_category: Optional[StructuredLogCategory] = None,
|
|
316
361
|
) -> Iterator[None]:
|
|
317
362
|
# Convenience method that helps avoid boilerplate try/except blocks.
|
|
318
363
|
# TODO: I'm not super happy with the naming here - it's not obvious that this
|
|
@@ -321,10 +366,16 @@ class SourceReport(Report):
|
|
|
321
366
|
yield
|
|
322
367
|
except Exception as exc:
|
|
323
368
|
self._structured_logs.report_log(
|
|
324
|
-
level,
|
|
369
|
+
level,
|
|
370
|
+
message=message,
|
|
371
|
+
title=title,
|
|
372
|
+
context=context,
|
|
373
|
+
exc=exc,
|
|
374
|
+
log_category=log_category,
|
|
325
375
|
)
|
|
326
376
|
|
|
327
377
|
def __post_init__(self) -> None:
|
|
378
|
+
super().__post_init__()
|
|
328
379
|
self.start_time = datetime.datetime.now()
|
|
329
380
|
self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
|
|
330
381
|
|
|
@@ -337,6 +388,43 @@ class SourceReport(Report):
|
|
|
337
388
|
"infos": Report.to_pure_python_obj(self.infos),
|
|
338
389
|
}
|
|
339
390
|
|
|
391
|
+
@staticmethod
|
|
392
|
+
def _discretize_dict_values(
|
|
393
|
+
nested_dict: Dict[str, Dict[str, int]],
|
|
394
|
+
) -> Dict[str, Dict[str, int]]:
|
|
395
|
+
"""Helper method to discretize values in a nested dictionary structure."""
|
|
396
|
+
result = {}
|
|
397
|
+
for outer_key, inner_dict in nested_dict.items():
|
|
398
|
+
discretized_dict: Dict[str, int] = {}
|
|
399
|
+
for inner_key, count in inner_dict.items():
|
|
400
|
+
discretized_dict[inner_key] = stats.discretize(count)
|
|
401
|
+
result[outer_key] = discretized_dict
|
|
402
|
+
return result
|
|
403
|
+
|
|
404
|
+
def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
|
|
405
|
+
"""Convert the nested defaultdict aspects to a regular dict for serialization."""
|
|
406
|
+
return self._discretize_dict_values(self.aspects)
|
|
407
|
+
|
|
408
|
+
def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
|
|
409
|
+
"""Get aspect counts grouped by entity type and subtype."""
|
|
410
|
+
return self._discretize_dict_values_nested(self.aspects_by_subtypes)
|
|
411
|
+
|
|
412
|
+
@staticmethod
|
|
413
|
+
def _discretize_dict_values_nested(
|
|
414
|
+
nested_dict: Dict[str, Dict[str, Dict[str, int]]],
|
|
415
|
+
) -> Dict[str, Dict[str, Dict[str, int]]]:
|
|
416
|
+
"""Helper method to discretize values in a nested dictionary structure with three levels."""
|
|
417
|
+
result = {}
|
|
418
|
+
for outer_key, middle_dict in nested_dict.items():
|
|
419
|
+
discretized_middle_dict: Dict[str, Dict[str, int]] = {}
|
|
420
|
+
for middle_key, inner_dict in middle_dict.items():
|
|
421
|
+
discretized_inner_dict: Dict[str, int] = {}
|
|
422
|
+
for inner_key, count in inner_dict.items():
|
|
423
|
+
discretized_inner_dict[inner_key] = stats.discretize(count)
|
|
424
|
+
discretized_middle_dict[middle_key] = discretized_inner_dict
|
|
425
|
+
result[outer_key] = discretized_middle_dict
|
|
426
|
+
return result
|
|
427
|
+
|
|
340
428
|
def compute_stats(self) -> None:
|
|
341
429
|
super().compute_stats()
|
|
342
430
|
|
|
@@ -453,12 +541,13 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
453
541
|
auto_status_aspect,
|
|
454
542
|
auto_materialize_referenced_tags_terms,
|
|
455
543
|
partial(
|
|
456
|
-
auto_fix_duplicate_schema_field_paths, platform=self.
|
|
544
|
+
auto_fix_duplicate_schema_field_paths, platform=self.infer_platform()
|
|
457
545
|
),
|
|
458
|
-
partial(auto_fix_empty_field_paths, platform=self.
|
|
546
|
+
partial(auto_fix_empty_field_paths, platform=self.infer_platform()),
|
|
459
547
|
browse_path_processor,
|
|
460
548
|
partial(auto_workunit_reporter, self.get_report()),
|
|
461
549
|
auto_patch_last_modified,
|
|
550
|
+
ValidateInputFieldsProcessor(self.get_report()).validate_input_fields,
|
|
462
551
|
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
|
|
463
552
|
]
|
|
464
553
|
|
|
@@ -475,13 +564,31 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
475
564
|
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
476
565
|
workunit_processors = self.get_workunit_processors()
|
|
477
566
|
workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
|
|
478
|
-
|
|
567
|
+
# Process main workunits
|
|
568
|
+
yield from self._apply_workunit_processors(
|
|
479
569
|
workunit_processors, auto_workunit(self.get_workunits_internal())
|
|
480
570
|
)
|
|
571
|
+
# Process profiling workunits
|
|
572
|
+
yield from self._process_profiling_stage(workunit_processors)
|
|
573
|
+
|
|
574
|
+
def _process_profiling_stage(
|
|
575
|
+
self, processors: List[Optional[MetadataWorkUnitProcessor]]
|
|
576
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
577
|
+
"""Process profiling stage if source supports it."""
|
|
578
|
+
if (
|
|
579
|
+
not isinstance(self, ProfilingCapable)
|
|
580
|
+
or not self.is_profiling_enabled_internal()
|
|
581
|
+
):
|
|
582
|
+
return
|
|
583
|
+
with self.get_report().new_high_stage(IngestionHighStage.PROFILING):
|
|
584
|
+
profiling_stream = self._apply_workunit_processors(
|
|
585
|
+
processors, auto_workunit(self.get_profiling_internal())
|
|
586
|
+
)
|
|
587
|
+
yield from profiling_stream
|
|
481
588
|
|
|
482
589
|
def get_workunits_internal(
|
|
483
590
|
self,
|
|
484
|
-
) ->
|
|
591
|
+
) -> MetadataWorkUnitIterable:
|
|
485
592
|
raise NotImplementedError(
|
|
486
593
|
"get_workunits_internal must be implemented if get_workunits is not overriden."
|
|
487
594
|
)
|
|
@@ -503,9 +610,9 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
503
610
|
pass
|
|
504
611
|
|
|
505
612
|
def close(self) -> None:
|
|
506
|
-
|
|
613
|
+
self.get_report().close()
|
|
507
614
|
|
|
508
|
-
def
|
|
615
|
+
def infer_platform(self) -> Optional[str]:
|
|
509
616
|
config = self.get_config()
|
|
510
617
|
platform = (
|
|
511
618
|
getattr(config, "platform_name", None)
|
|
@@ -520,7 +627,7 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
520
627
|
def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
|
|
521
628
|
config = self.get_config()
|
|
522
629
|
|
|
523
|
-
platform = self.
|
|
630
|
+
platform = self.infer_platform()
|
|
524
631
|
env = getattr(config, "env", None)
|
|
525
632
|
browse_path_drop_dirs = [
|
|
526
633
|
platform,
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Iterable, Protocol, Union, runtime_checkable
|
|
2
|
+
|
|
3
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
4
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
5
|
+
from datahub.sdk.entity import Entity
|
|
6
|
+
|
|
7
|
+
# Type alias for metadata work units - Python 3.9 compatible
|
|
8
|
+
MetadataWorkUnitIterable = Iterable[
|
|
9
|
+
Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@runtime_checkable
|
|
14
|
+
class ProfilingCapable(Protocol):
|
|
15
|
+
"""Protocol for sources that support profiling functionality."""
|
|
16
|
+
|
|
17
|
+
def is_profiling_enabled_internal(self) -> bool:
|
|
18
|
+
"""Check if profiling is enabled for this source."""
|
|
19
|
+
...
|
|
20
|
+
|
|
21
|
+
def get_profiling_internal(self) -> MetadataWorkUnitIterable:
|
|
22
|
+
"""Generate profiling work units."""
|
|
23
|
+
...
|
|
File without changes
|