acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -244,3 +244,24 @@ class RedundantUsageRunSkipHandler(RedundantRunSkipHandler):
             cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
             cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
             cur_state.bucket_duration = bucket_duration
+
+
+class RedundantQueriesRunSkipHandler(RedundantRunSkipHandler):
+    """
+    Handler for stateful ingestion of queries v2 extraction.
+    Manages the time window for audit log extraction that combines
+    lineage, usage, operations, and queries.
+    """
+
+    def get_job_name_suffix(self):
+        return "_audit_window"
+
+    def update_state(
+        self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration
+    ) -> None:
+        cur_checkpoint = self.get_current_checkpoint()
+        if cur_checkpoint:
+            cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state)
+            cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
+            cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
+            cur_state.bucket_duration = bucket_duration
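The new RedundantQueriesRunSkipHandler stores the audit-log window as epoch milliseconds in the checkpoint state. A minimal sketch of that conversion and of what the stored pair means, using a local helper rather than DataHub's own datetime_to_ts_millis (assumed to behave the same way):

from datetime import datetime, timezone

def to_ts_millis(dt: datetime) -> int:
    # Assumed equivalent of datetime_to_ts_millis: epoch milliseconds.
    return int(dt.timestamp() * 1000)

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
end = datetime(2024, 1, 2, tzinfo=timezone.utc)
# The checkpoint keeps this pair; a later run can skip or shrink any
# extraction window that falls entirely inside [start, end].
print(to_ts_millis(start), to_ts_millis(end))  # 1704067200000 1704153600000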
datahub/ingestion/source/state/stateful_ingestion_base.py

@@ -10,6 +10,7 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     DynamicTypedConfig,
+    HiddenFromDocs,
 )
 from datahub.configuration.pydantic_migration_helpers import GenericModel
 from datahub.configuration.time_window_config import BaseTimeWindowConfig

@@ -55,25 +56,21 @@ class StatefulIngestionConfig(ConfigModel):
         description="Whether or not to enable stateful ingest. "
         "Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
     )
-    max_checkpoint_state_size: pydantic.PositiveInt = Field(
+    max_checkpoint_state_size: HiddenFromDocs[pydantic.PositiveInt] = Field(
         default=2**24,  # 16 MB
         description="The maximum size of the checkpoint state in bytes. Default is 16MB",
-        hidden_from_docs=True,
     )
-    state_provider: Optional[DynamicTypedStateProviderConfig] = Field(
+    state_provider: HiddenFromDocs[Optional[DynamicTypedStateProviderConfig]] = Field(
         default=None,
         description="The ingestion state provider configuration.",
-        hidden_from_docs=True,
     )
-    ignore_old_state: bool = Field(
+    ignore_old_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the previous checkpoint state.",
-        hidden_from_docs=True,
     )
-    ignore_new_state: bool = Field(
+    ignore_new_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the current checkpoint state.",
-        hidden_from_docs=True,
     )

     @pydantic.root_validator(skip_on_failure=True)
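The hidden_from_docs=True keyword on each Field is replaced by a HiddenFromDocs[...] type annotation. The marker's implementation is not shown in this diff; a minimal sketch of how such a marker can be built on typing.Annotated (the class names below are illustrative, and the assumption is that doc generation simply filters fields carrying the sentinel):

from typing import Annotated, get_type_hints

class _HiddenFromDocsMarker:
    """Sentinel stored in the annotation metadata (illustrative)."""

class HiddenFromDocs:
    # HiddenFromDocs[int] becomes Annotated[int, _HiddenFromDocsMarker]
    def __class_getitem__(cls, item):
        return Annotated[item, _HiddenFromDocsMarker]

class ExampleConfig:
    visible: int = 1
    secret: HiddenFromDocs[int] = 2

def hidden_fields(model: type) -> list:
    # Fields whose annotation carries the sentinel are omitted from docs.
    hints = get_type_hints(model, include_extras=True)
    return [
        name
        for name, hint in hints.items()
        if _HiddenFromDocsMarker in getattr(hint, "__metadata__", ())
    ]

print(hidden_fields(ExampleConfig))  # ['secret']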
@@ -104,7 +101,9 @@ class StatefulLineageConfigMixin(ConfigModel):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store lineage window timestamps after successful lineage ingestion. "
-        "and will not run lineage ingestion for same timestamps in subsequent run. "
+        "and will not run lineage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )

     _store_last_lineage_extraction_timestamp = pydantic_renamed_field(

@@ -153,7 +152,9 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store usage window timestamps after successful usage ingestion. "
-        "and will not run usage ingestion for same timestamps in subsequent run. "
+        "and will not run usage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )

     _store_last_usage_extraction_timestamp = pydantic_renamed_field(

@@ -172,6 +173,30 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         return values


+class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
+    enable_stateful_time_window: bool = Field(
+        default=False,
+        description="Enable stateful time window tracking."
+        " This will store the time window after successful extraction "
+        "and adjust the time window in subsequent runs to avoid reprocessing. "
+        "NOTE: This is ONLY applicable when using queries v2 (use_queries_v2=True). "
+        "This replaces enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion "
+        "for the queries v2 extraction path, since queries v2 extracts lineage, usage, operations, "
+        "and queries together from a single audit log and uses a unified time window.",
+    )
+
+    @root_validator(skip_on_failure=True)
+    def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
+        sti = values.get("stateful_ingestion")
+        if not sti or not sti.enabled:
+            if values.get("enable_stateful_time_window"):
+                logger.warning(
+                    "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
+                )
+                values["enable_stateful_time_window"] = False
+        return values
+
+
 @dataclass
 class StatefulIngestionReport(SourceReport):
     pass

@@ -179,7 +204,7 @@ class StatefulIngestionReport(SourceReport):

 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
datahub/ingestion/source/superset.py

@@ -8,9 +8,11 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import dateutil.parser as dp
 import requests
-
-from pydantic
+import sqlglot
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern

@@ -75,6 +77,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,

@@ -107,6 +110,12 @@ logger = logging.getLogger(__name__)

 PAGE_SIZE = 25

+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+

 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,

@@ -131,8 +140,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }


@@ -149,6 +161,7 @@ class SupersetDataset(BaseModel):
     table_name: str
     changed_on_utc: Optional[str] = None
     explore_url: Optional[str] = ""
+    description: Optional[str] = ""

     @property
     def modified_dt(self) -> Optional[datetime]:

@@ -272,10 +285,11 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

@@ -321,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")

         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
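A self-contained sketch of the same retry setup outside the connector (the endpoint URL is made up); because raise_on_status=False, the session hands back the last response once retries are exhausted, so the caller can still branch on status_code as the next hunk does:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.mount(
    "https://",
    HTTPAdapter(
        max_retries=Retry(
            total=3,                                     # retry up to 3 times
            status_forcelist=[429, 500, 502, 503, 504],  # retry on these statuses
            backoff_factor=1,                            # exponential backoff between attempts
            allowed_methods=["GET"],                     # only retry idempotent GETs
            raise_on_status=False,                       # return the last response instead of raising
        )
    ),
)

resp = session.get("https://superset.example.com/api/v1/dashboard/")  # hypothetical URL
if resp.status_code != 200:
    print(f"giving up after retries: {resp.status_code}")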
@@ -353,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         if response.status_code != 200:
-
-
+            self.report.warning(
+                title="Failed to fetch data from Superset API",
+                message="Incomplete metadata extraction due to Superset API failure",
+                context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+            )
+            # we stop pagination for this entity type and we continue the overall ingestion
+            break

         payload = response.json()
         # Update total_items with the actual count from the response

@@ -515,6 +547,11 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)

+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot

     def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:

@@ -633,62 +670,130 @@ class SupersetSource(StatefulIngestionSourceBase):

         return input_fields

-    def
-
-
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
-
-        # the second field represents whether its a SQL expression,
-        # false being just regular column and true being SQL col
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []

-
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)

-
-
-
-
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)

-
-
-
-
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []

-
-
-
-
-            continue
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""

-
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
         else:
-
+            metrics_data = form_data.get("metrics", [])
+
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
+
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
+
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
+
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
             )
             return []

+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
-
-
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue

             # find matching upstream column
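The new _extract_columns_from_sql helper uses sqlglot to recover the column references behind a SQL-type chart field. A standalone sketch of the same idea on a made-up Superset metric expression, using find_all rather than the walk/isinstance loop above:

import sqlglot
from sqlglot import exp

def columns_in_expression(sql_expr: str) -> list:
    # Parse the expression and collect every column reference it mentions.
    parsed = sqlglot.parse_one(sql_expr)
    return sorted({col.name for col in parsed.find_all(exp.Column)})

# A SQL-type Superset metric like this resolves to its underlying columns,
# which is what lets the connector map chart fields back to dataset columns.
print(columns_in_expression("SUM(price * quantity) / NULLIF(COUNT(order_id), 0)"))
# ['order_id', 'price', 'quantity']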
@@ -699,13 +804,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                 if dataset_col_name == chart_col_name:
                     chart_columns.append(
                         (chart_col_name, dataset_col_type, dataset_col_description)
-                )
+                    )
                     break
-
-            # if no matching upstream column was found
-            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                 chart_columns.append((chart_col_name, "", ""))

+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
         return self.build_input_fields(chart_columns, datasource_urn)

     def construct_chart_from_chart_data(

@@ -822,6 +950,12 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
+
+        superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+        tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+        if tags:
+            chart_snapshot.aspects.append(tags)
+
         yield MetadataWorkUnit(
             id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         )

@@ -966,7 +1100,27 @@ class SupersetSource(StatefulIngestionSourceBase):
                 fieldPath=col.get("column_name", ""),
                 type=SchemaFieldDataType(data_type),
                 nativeDataType="",
-                description=col.get("column_name", ""),
+                description=col.get("description") or col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
                 nullable=True,
             )
             schema_fields.append(field)

@@ -978,13 +1132,18 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1208,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:

@@ -1067,6 +1228,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(

@@ -1087,7 +1264,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
-        dataset_url = f"{self.config.display_uri}{
+        dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         now = datetime.now().strftime("%I:%M%p on %B %d, %Y")

@@ -1144,21 +1321,22 @@ class SupersetSource(StatefulIngestionSourceBase):

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
-            description="",
+            description=dataset.description or "",
             externalUrl=dataset_url,
             lastModified=TimeStamp(time=modified_ts),
         )
-
-
-
-
-
-
-
-
-
-
-
+
+        dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+        tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+        aspects_items: List[Any] = [
+            self.gen_schema_metadata(dataset_response),
+            dataset_info,
+            upstream_lineage,
+        ]
+
+        if tags:
+            aspects_items.append(tags)

         dataset_snapshot = DatasetSnapshot(
             urn=datasource_urn,

@@ -1180,6 +1358,75 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

+    def _extract_and_map_tags(
+        self, raw_tags: List[Dict[str, Any]]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+        Filters out system-generated tags (type != 1) and only processes user-defined tags
+        from the Superset API response.
+
+        Args:
+            raw_tags: List of tag dictionaries from Superset API
+
+        Returns:
+            GlobalTagsClass with user-defined tags, or None if no tags found
+        """
+        user_tags = [
+            tag.get("name", "")
+            for tag in raw_tags
+            if tag.get("type") == 1 and tag.get("name")
+        ]
+
+        if not user_tags:
+            return None
+
+        tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+        return GlobalTagsClass(
+            tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+        )
+
+    def _merge_tags_with_existing(
+        self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+    ) -> Optional[GlobalTagsClass]:
+        """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+        This method ensures that tags manually added via DataHub UI are not overwritten
+        during ingestion. It fetches existing tags from the graph and merges them with
+        new tags from the source system, avoiding duplicates.
+
+        Args:
+            entity_urn: URN of the entity to check for existing tags
+            new_tags: New tags to add as GlobalTagsClass object
+
+        Returns:
+            GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+        """
+        if not new_tags or not new_tags.tags:
+            return None
+
+        # Fetch existing tags from DataHub
+        existing_global_tags = None
+        if self.ctx.graph:
+            existing_global_tags = self.ctx.graph.get_aspect(
+                entity_urn=entity_urn, aspect_type=GlobalTagsClass
+            )
+
+        # Merge existing tags with new ones, avoiding duplicates
+        all_tags = []
+        existing_tag_urns = set()
+
+        if existing_global_tags and existing_global_tags.tags:
+            all_tags.extend(existing_global_tags.tags)
+            existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+        # Add new tags that don't already exist
+        for new_tag in new_tags.tags:
+            if new_tag.tag not in existing_tag_urns:
+                all_tags.append(new_tag)
+
+        return GlobalTagsClass(tags=all_tags) if all_tags else None
+
     def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
         dataset_name = ""
         try: