acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from typing import TYPE_CHECKING, Iterable, List
 
 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
@@ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
+    QueryPropertiesClass,
+    QuerySubjectsClass,
     SchemaFieldClass,
     SchemaMetadataClass,
+    UpstreamLineageClass,
 )
 
 if TYPE_CHECKING:
     from datahub.ingestion.api.source import SourceReport
 
+
+# TODO: ordering
+# In the cases where we trim collections of data (e.g. fields in schema, upstream lineage, query subjects), given
+# those collections are typically unordered, we should consider sorting them by some criteria (e.g. size, alphabetically)
+# so that the trimming is deterministic and predictable and more importantly consistent across executions.
+# In the case of schemaMetadata, that's more relevant as currently we may be trimming fields while adding nested ones,
+# which may lead to poorly schema rendering in the UI.
+
 logger = logging.getLogger(__name__)
 
+DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB
+QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int(
+    os.environ.get(
+        "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES",
+        DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES,
+    )
+)
+
+QUERY_STATEMENT_TRUNCATION_BUFFER = 100
+
 
 class EnsureAspectSizeProcessor:
     def __init__(
@@ -81,6 +103,274 @@ class EnsureAspectSizeProcessor:
 
         schema.fields = accepted_fields
 
+    def ensure_query_subjects_size(
+        self, entity_urn: str, query_subjects: QuerySubjectsClass
+    ) -> None:
+        """
+        Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first,
+        then table lineage if necessary.
+        """
+        if not query_subjects.subjects:
+            return
+
+        total_subjects_size = 0
+        accepted_table_level_subjects = []
+        accepted_column_level_subjects = []
+        column_level_subjects_with_sizes = []
+        table_level_subjects_with_sizes = []
+
+        # Separate column-level and table-level subjects
+        for subject in query_subjects.subjects:
+            subject_size = len(json.dumps(pre_json_transform(subject.to_obj())))
+
+            if subject.entity.startswith("urn:li:schemaField:"):
+                column_level_subjects_with_sizes.append((subject, subject_size))
+            else:
+                table_level_subjects_with_sizes.append((subject, subject_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, try to include all table-level subjects
+        for subject, subject_size in table_level_subjects_with_sizes:
+            if total_subjects_size + subject_size < self.payload_constraint:
+                accepted_table_level_subjects.append(subject)
+                total_subjects_size += subject_size
+            else:
+                first_skip_done = True
+                break
+
+        # Then, add column-level subjects if there's remaining space
+        # Only process if we successfully included all table-level subjects
+        if not first_skip_done:
+            for subject, subject_size in column_level_subjects_with_sizes:
+                if total_subjects_size + subject_size < self.payload_constraint:
+                    accepted_column_level_subjects.append(subject)
+                    total_subjects_size += subject_size
+                else:
+                    first_skip_done = True
+                    break
+
+        if first_skip_done:
+            # Log aggregate warnings
+            table_level_skipped_count = len(table_level_subjects_with_sizes) - len(
+                accepted_table_level_subjects
+            )
+            column_level_skipped_count = len(column_level_subjects_with_sizes) - len(
+                accepted_column_level_subjects
+            )
+
+            self._maybe_warn_query_subjects(
+                entity_urn, table_level_skipped_count, "table-level lineage subjects"
+            )
+            self._maybe_warn_query_subjects(
+                entity_urn, column_level_skipped_count, "column-level lineage subjects"
+            )
+
+        query_subjects.subjects = (
+            accepted_table_level_subjects + accepted_column_level_subjects
+        )
+
+    def _maybe_warn_query_subjects(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for query subjects truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Query subjects truncated due to size constraint",
+                message="Query subjects contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def _maybe_warn_upstream_lineage(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for upstream lineage truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Upstream lineage truncated due to size constraint",
+                message="Upstream lineage contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def ensure_upstream_lineage_size(  # noqa: C901
+        self, entity_urn: str, upstream_lineage: UpstreamLineageClass
+    ) -> None:
+        """
+        Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order:
+        first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages,
+        then DATASET fine-grained lineages, and finally upstreams (highest priority).
+        """
+        if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams:
+            return
+
+        total_lineage_size = 0
+        accepted_upstreams = []
+        accepted_dataset_fg_lineages = []
+        accepted_field_set_fg_lineages = []
+        accepted_none_fg_lineages = []
+        upstream_items_with_sizes = []
+        dataset_fg_items_with_sizes = []
+        field_set_fg_items_with_sizes = []
+        none_fg_items_with_sizes = []
+
+        # Add upstreams (highest priority)
+        if upstream_lineage.upstreams:
+            for upstream in upstream_lineage.upstreams:
+                upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj())))
+                upstream_items_with_sizes.append((upstream, upstream_size))
+
+        # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE
+        if upstream_lineage.fineGrainedLineages:
+            for fg_lineage in upstream_lineage.fineGrainedLineages:
+                fg_lineage_size = len(
+                    json.dumps(pre_json_transform(fg_lineage.to_obj()))
+                )
+
+                upstream_type_str = str(fg_lineage.upstreamType)
+                if upstream_type_str == "DATASET":
+                    dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "FIELD_SET":
+                    field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "NONE":
+                    none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, include all upstreams (highest priority)
+        for item, item_size in upstream_items_with_sizes:
+            if total_lineage_size + item_size < self.payload_constraint:
+                accepted_upstreams.append(item)
+                total_lineage_size += item_size
+            else:
+                first_skip_done = True
+                break
+
+        # Second, include DATASET fine-grained lineages if no upstreams were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_dataset_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_field_set_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Finally, include NONE fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in none_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_none_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Log aggregate warnings instead of per-item warnings
+        if first_skip_done:
+            upstreams_skipped_count = len(upstream_items_with_sizes) - len(
+                accepted_upstreams
+            )
+            dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len(
+                accepted_dataset_fg_lineages
+            )
+            field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len(
+                accepted_field_set_fg_lineages
+            )
+            none_fg_skipped_count = len(none_fg_items_with_sizes) - len(
+                accepted_none_fg_lineages
+            )
+
+            self._maybe_warn_upstream_lineage(
+                entity_urn, upstreams_skipped_count, "upstream datasets"
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                dataset_fg_skipped_count,
+                "dataset-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                field_set_fg_skipped_count,
+                "field-set-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn, none_fg_skipped_count, "none-level fine-grained lineages"
+            )
+
+        # Combine all accepted fine-grained lineages
+        accepted_fine_grained_lineages = (
+            accepted_dataset_fg_lineages
+            + accepted_field_set_fg_lineages
+            + accepted_none_fg_lineages
+        )
+
+        upstream_lineage.upstreams = accepted_upstreams
+        upstream_lineage.fineGrainedLineages = (
+            accepted_fine_grained_lineages if accepted_fine_grained_lineages else None
+        )
+
+    def ensure_query_properties_size(
+        self, entity_urn: str, query_properties: QueryPropertiesClass
+    ) -> None:
+        """
+        Ensure query properties aspect does not exceed allowed size by truncating the query statement value.
+        Uses a configurable max payload size that is the minimum between QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES
+        and INGEST_MAX_PAYLOAD_BYTES.
+
+        We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail;
+        that was INSERT INTO VALUES with huge list of values.
+        """
+        if not query_properties.statement or not query_properties.statement.value:
+            return
+
+        max_payload_size = min(
+            QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint
+        )
+
+        current_size = len(json.dumps(pre_json_transform(query_properties.to_obj())))
+
+        if current_size < max_payload_size:
+            return
+
+        reduction_needed = (
+            current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER
+        )
+
+        statement_value_size = len(query_properties.statement.value)
+        original_statement_size = statement_value_size
+
+        # Only truncate if reduction is actually needed and possible
+        if statement_value_size > reduction_needed > 0:
+            new_statement_length = statement_value_size - reduction_needed
+            truncated_statement = query_properties.statement.value[
+                :new_statement_length
+            ]
+
+            truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]"
+            query_properties.statement.value = truncated_statement + truncation_message
+
+            self.report.warning(
+                title="Query properties truncated due to size constraint",
+                message="Query properties contained too much data and would have caused ingestion to fail",
+                context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints",
+            )
+        else:
+            logger.warning(
+                f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. That means that 'ensure_query_properties_size' must be extended to trim other fields different than statement."
+            )
+
     def ensure_aspect_size(
         self,
         stream: Iterable[MetadataWorkUnit],
@@ -90,10 +380,16 @@
         on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
         """
         for wu in stream:
-            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            # logger.debug(f"Ensuring size of workunit: {wu.id}")
 
             if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
             elif profile := wu.get_aspect_of_type(DatasetProfileClass):
                 self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass):
+                self.ensure_query_subjects_size(wu.get_urn(), query_subjects)
+            elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass):
+                self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage)
+            elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass):
+                self.ensure_query_properties_size(wu.get_urn(), query_properties)
             yield wu
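
The new cap on query statements is read from the QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES environment variable (default 5 MB), and the effective limit is the smaller of that cap and INGEST_MAX_PAYLOAD_BYTES. Below is a minimal standalone sketch of the truncation arithmetic; the ingest limit and the statement/aspect sizes are assumed example values, not values taken from this release.

# Sketch of the truncation math; only the 5 MB default and the 100-byte buffer come from the diff above.
QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024
QUERY_STATEMENT_TRUNCATION_BUFFER = 100
INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024  # assumed server payload limit for this example

current_size = 6 * 1024 * 1024       # assumed serialized size of the QueryProperties aspect
statement_len = current_size - 2048  # assumed length of the statement text itself

max_payload_size = min(QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, INGEST_MAX_PAYLOAD_BYTES)
reduction_needed = current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER

if statement_len > reduction_needed > 0:
    new_statement_length = statement_len - reduction_needed
    print(f"truncate statement from {statement_len} to {new_statement_length} characters")
else:
    print("statement cannot absorb the reduction; other fields would need trimming")

With these numbers the statement loses roughly 1 MB plus the buffer, and the processor then appends the "[original value was ... bytes and truncated to ... bytes]" suffix shown in the diff.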

datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py (new file)

@@ -0,0 +1,87 @@
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import InputFieldClass, InputFieldsClass
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class ValidateInputFieldsProcessor:
+    def __init__(self, report: "SourceReport"):
+        self.report = report
+
+    def validate_input_fields(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Validate input fields and filter out invalid ones.
+
+        Invalid input fields have empty or missing fieldPath values, which would cause
+        URN generation to fail when sent to the server. This processor filters them out
+        and reports them as warnings.
+        """
+        for wu in stream:
+            input_fields_aspect = wu.get_aspect_of_type(InputFieldsClass)
+            if input_fields_aspect and input_fields_aspect.fields:
+                valid_fields: List[InputFieldClass] = []
+                invalid_count = 0
+
+                for input_field in input_fields_aspect.fields:
+                    if (
+                        input_field.schemaField
+                        and input_field.schemaField.fieldPath
+                        and input_field.schemaField.fieldPath.strip()
+                    ):
+                        valid_fields.append(input_field)
+                    else:
+                        invalid_count += 1
+
+                if invalid_count > 0:
+                    logger.debug(
+                        f"Filtered {invalid_count} invalid input field(s) with empty fieldPath for {wu.get_urn()}"
+                    )
+                    self.report.num_input_fields_filtered += invalid_count
+                    self.report.warning(
+                        title="Invalid input fields filtered",
+                        message="Input fields with empty fieldPath values were filtered out to prevent ingestion errors",
+                        context=f"Filtered {invalid_count} invalid input field(s) for {wu.get_urn()}",
+                    )
+
+                # Update the aspect with only valid fields
+                if valid_fields:
+                    input_fields_aspect.fields = valid_fields
+                else:
+                    # If no valid fields remain, skip this workunit entirely
+                    logger.debug(
+                        f"All input fields were invalid for {wu.get_urn()}, skipping InputFieldsClass workunit"
+                    )
+                    # Don't yield this workunit
+                    continue
+
+            yield wu
+
+    def _remove_input_fields_aspect(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        """Remove InputFieldsClass aspect from a workunit."""
+        # For MCPs, we can simply not yield the aspect
+        # For MCEs, we need to remove it from the snapshot
+        if hasattr(wu.metadata, "aspect") and isinstance(
+            wu.metadata.aspect, InputFieldsClass
+        ):
+            # This is an MCP with InputFieldsClass, skip it
+            return wu
+
+        if hasattr(wu.metadata, "proposedSnapshot"):
+            snapshot = wu.metadata.proposedSnapshot
+            if hasattr(snapshot, "aspects"):
+                snapshot.aspects = [
+                    aspect
+                    for aspect in snapshot.aspects
+                    if not isinstance(aspect, InputFieldsClass)
+                ]
+
+        return wu
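
This excerpt does not show where the new processor gets registered (that presumably happens in datahub/ingestion/api/source.py, which also changes in this release). A minimal sketch of wrapping a workunit stream with it, assuming the report object already carries the num_input_fields_filtered counter the processor increments:

from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
    ValidateInputFieldsProcessor,
)


def apply_input_field_validation(report, stream):
    # Drops InputFields entries whose schemaField.fieldPath is empty and skips
    # workunits that would be left with no valid fields at all.
    return ValidateInputFieldsProcessor(report).validate_input_fields(stream)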

datahub/ingestion/api/decorators.py

@@ -1,12 +1,16 @@
+# So that SourceCapabilityModifier can be resolved at runtime
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, Optional, Type
+from typing import Callable, Dict, List, Optional, Type
 
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import (
     Source,
     SourceCapability as SourceCapability,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 
 
 def config_class(config_cls: Type) -> Callable[[Type], Type]:
@@ -88,10 +92,14 @@ class CapabilitySetting:
     capability: SourceCapability
     description: str
     supported: bool
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None
 
 
 def capability(
-    capability_name: SourceCapability, description: str, supported: bool = True
+    capability_name: SourceCapability,
+    description: str,
+    supported: bool = True,
+    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
 ) -> Callable[[Type], Type]:
     """
     A decorator to mark a source as having a certain capability
@@ -104,6 +112,7 @@ def capability(
             for base in cls.__bases__
         ):
             cls.__capabilities = {}
+
             cls.get_capabilities = lambda: cls.__capabilities.values()
 
             # If the superclasses have capability annotations, copy those over.
@@ -113,7 +122,10 @@ def capability(
                     cls.__capabilities.update(base_caps)
 
         cls.__capabilities[capability_name] = CapabilitySetting(
-            capability=capability_name, description=description, supported=supported
+            capability=capability_name,
+            description=description,
+            supported=supported,
+            subtype_modifier=subtype_modifier,
         )
         return cls
 
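
With the extended decorator, a source can attach subtype modifiers to a declared capability. A hedged usage sketch follows: SourceCapability.CONTAINERS is an existing capability, but the SourceCapabilityModifier member name below is illustrative only (the real members are defined in datahub/ingestion/source/common/subtypes.py, also extended in this release).

from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.source import Source, SourceCapability
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


@capability(
    SourceCapability.CONTAINERS,
    "Enabled by default",
    subtype_modifier=[SourceCapabilityModifier.SCHEMA],  # illustrative member name
)
class MyCustomSource(Source):
    """Sketch only; a real source still implements the usual Source methods."""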