acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
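The rendered diff below covers datahub/ingestion/source/superset.py. As a quick orientation, here is a minimal, hypothetical sketch of how the new SupersetConfig options introduced in this release (timeout, max_threads, database_pattern) might be constructed; the class and field names come from the diff, while the surrounding values are illustrative assumptions and not part of the package.

# Hypothetical usage sketch (not part of the package diff).
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig(
    connect_uri="http://localhost:8088",  # Superset URL (assumed value)
    username="admin",                     # assumed credentials
    password="admin",
    timeout=10,                           # new: per-request API timeout in seconds
    max_threads=8,                        # new: parallelism for dashboard/chart API calls
    database_pattern=AllowDenyPattern(deny=["^internal_db$"]),  # new: regex filter for databases (assumed pattern)
)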
datahub/ingestion/source/superset.py
@@ -1,16 +1,20 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import dateutil.parser as dp
 import requests
-
-from pydantic
+import sqlglot
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
+import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -23,8 +27,10 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -49,6 +55,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     ChangeAuditStamps,
+    InputField,
+    InputFields,
     Status,
     TimeStamp,
 )
@@ -59,11 +67,17 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    BooleanTypeClass,
+    DateTypeClass,
     MySqlDDL,
     NullType,
+    NullTypeClass,
+    NumberTypeClass,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
+    StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -72,6 +86,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,14 +97,25 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
 
 PAGE_SIZE = 25
 
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
 
 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -105,9 +133,20 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }
 
-
 platform_without_databases = ["druid"]
 
+FIELD_TYPE_MAPPING = {
+    "INT": NumberTypeClass,
+    "STRING": StringTypeClass,
+    "FLOAT": NumberTypeClass,
+    "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
+    "BOOLEAN": BooleanTypeClass,
+    "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
+}
+
 
 @dataclass
 class SupersetSourceReport(StaleEntityRemovalSourceReport):
@@ -122,6 +161,7 @@ class SupersetDataset(BaseModel):
     table_name: str
     changed_on_utc: Optional[str] = None
     explore_url: Optional[str] = ""
+    description: Optional[str] = ""
 
     @property
     def modified_dt(self) -> Optional[datetime]:
@@ -139,6 +179,7 @@ class SupersetDataset(BaseModel):
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
+    # TODO: Add support for missing dataPlatformInstance/containers
     # See the Superset /security/login endpoint for details
     # https://superset.apache.org/docs/rest-api
     connect_uri: str = Field(
@@ -150,7 +191,7 @@ class SupersetConfig(
     )
     domain: Dict[str, AllowDenyPattern] = Field(
         default=dict(),
-        description="
+        description="Regex patterns for tables to filter to assign domain_key. ",
     )
     dataset_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -164,6 +205,10 @@ class SupersetConfig(
         AllowDenyPattern.allow_all(),
         description="Patterns for selecting dashboard names that are to be included",
     )
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for databases to filter in ingestion.",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
@@ -181,6 +226,15 @@ class SupersetConfig(
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")
 
+    timeout: int = Field(
+        default=10, description="Timeout of single API call to superset."
+    )
+
+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
@@ -231,10 +285,11 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -261,6 +316,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         self.session = self.login()
         self.owner_info = self.parse_owner_info()
+        self.filtered_dataset_to_database: Dict[int, str] = {}
+        self.filtered_chart_to_database: Dict[int, str] = {}
+        self.processed_charts: Dict[int, Tuple[Optional[str], bool]] = {}
 
     def login(self) -> requests.Session:
         login_response = requests.post(
@@ -277,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")
 
         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
@@ -285,13 +356,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             }
         )
 
-        # Test the connection
         test_response = requests_session.get(
-            f"{self.config.connect_uri}/api/v1/dashboard/"
+            f"{self.config.connect_uri}/api/v1/dashboard/",
+            timeout=self.config.timeout,
         )
-        if test_response.status_code
-
-            #
+        if test_response.status_code != 200:
+            # throw an error and terminate ingestion,
+            # cannot proceed without access token
+            logger.error(
+                f"Failed to log in to Superset with status: {test_response.status_code}"
+            )
         return requests_session
 
     def paginate_entity_api_results(self, entity_type, page_size=100):
@@ -302,10 +376,17 @@ class SupersetSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
+                timeout=self.config.timeout,
             )
 
             if response.status_code != 200:
-
+                self.report.warning(
+                    title="Failed to fetch data from Superset API",
+                    message="Incomplete metadata extraction due to Superset API failure",
+                    context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+                )
+                # we stop pagination for this entity type and we continue the overall ingestion
+                break
 
             payload = response.json()
             # Update total_items with the actual count from the response
@@ -339,10 +420,11 @@ class SupersetSource(StatefulIngestionSourceBase):
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
             f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+            timeout=self.config.timeout,
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-
+            return {}
         return dataset_response.json()
 
     def get_datasource_urn_from_id(
@@ -393,8 +475,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
 
         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc",
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
@@ -464,37 +547,298 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)
 
+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot
 
-    def
-
-
-
-
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                )
+                return
+
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                raw_position_data = dashboard_data.get("position_json", "{}")
+                position_data = (
+                    json.loads(raw_position_data)
+                    if raw_position_data is not None
+                    else {}
+                )
 
-
-
-
+                chart_ids = []
+                for key, value in position_data.items():
+                    if not key.startswith("CHART-"):
+                        continue
+                    chart_id = value.get("meta", {}).get("chartId")
+                    if chart_id:
+                        chart_ids.append(chart_id)
+
+                for chart_id in chart_ids:
+                    if chart_id in self.processed_charts:
+                        database_name, is_filtered = self.processed_charts[chart_id]
+                        if is_filtered:
+                            self.report.warning(
+                                message="Dashboard contains charts using datasets from a filtered database. Set the dashboard pattern to deny ingestion.",
+                                context=str(
+                                    dict(
+                                        dashboard_id=dashboard_id,
+                                        dashboard_title=dashboard_title,
+                                        chart_id=chart_id,
+                                        database_name=database_name,
+                                    )
+                                ),
+                                title="Incomplete Ingestion",
+                            )
+
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+
+        except Exception as e:
+            self.report.warning(
+                message="Failed to construct dashboard snapshot. This dashboard will not be ingested.",
+                context=str(
+                    dict(
+                        dashboard_id=dashboard_id,
+                        dashboard_title=dashboard_title,
+                        error=str(e),
                     )
-
+                ),
+                title="Dashboard Construction Failed",
+                exc=e,
+            )
+            return
 
-
-
-
-
-
-
-
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
+            )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )
+
+    def build_input_fields(
+        self,
+        chart_columns: List[Tuple[str, str, str]],
+        datasource_urn: Union[str, None],
+    ) -> List[InputField]:
+        input_fields: List[InputField] = []
+
+        for column in chart_columns:
+            col_name, col_type, description = column
+            if not col_type or not datasource_urn:
                 continue
-
-
-
-
-
-
+
+            type_class = FIELD_TYPE_MAPPING.get(
+                col_type.upper(), NullTypeClass
+            )  # gets the type mapping
+
+            input_fields.append(
+                InputField(
+                    schemaFieldUrn=builder.make_schema_field_urn(
+                        parent_urn=str(datasource_urn),
+                        field_path=col_name,
+                    ),
+                    schemaField=SchemaField(
+                        fieldPath=col_name,
+                        type=SchemaFieldDataType(type=type_class()),  # type: ignore
+                        description=(description if description != "null" else ""),
+                        nativeDataType=col_type,
+                        globalTags=None,
+                        nullable=True,
+                    ),
+                )
+            )
+
+        return input_fields
+
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []
+
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)
+
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)
+
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []
+
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""
+
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])
+
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
+
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
+
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
+
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
+            logger.warning(
+                "no datasource id was found, cannot build column level lineage"
             )
+            return []
 
-
+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
+        chart_columns: List[Tuple[str, str, str]] = []
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
+            if is_sql:
+                chart_columns.append((chart_col_name, "SQL", ""))
+                continue
+
+            # find matching upstream column
+            for dataset_col in dataset_columns:
+                dataset_col_name, dataset_col_type, dataset_col_description = (
+                    dataset_col
+                )
+                if dataset_col_name == chart_col_name:
+                    chart_columns.append(
+                        (chart_col_name, dataset_col_type, dataset_col_description)
+                    )
+                    break
+            else:
+                chart_columns.append((chart_col_name, "", ""))
+
+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
+        return self.build_input_fields(chart_columns, datasource_urn)
+
+    def construct_chart_from_chart_data(
+        self, chart_data: dict
+    ) -> Iterable[MetadataWorkUnit]:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=str(chart_data["id"]),
@@ -506,8 +850,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
 
         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc",
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")
 
@@ -581,6 +926,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         chart_snapshot.aspects.append(chart_info)
 
+        input_fields = self.construct_chart_cll(
+            chart_data, datasource_urn, datasource_id
+        )
+
+        if input_fields:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=chart_urn,
+                aspect=InputFields(
+                    fields=sorted(input_fields, key=lambda x: x.schemaFieldUrn)
+                ),
+            ).as_workunit()
+
         chart_owners_list = self.build_owner_urn(chart_data)
         owners_info = OwnershipClass(
             owners=[
@@ -593,50 +950,143 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
-        return chart_snapshot
 
-
-
-
-
-            chart_name = chart_data.get("slice_name", "")
+        superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+        tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+        if tags:
+            chart_snapshot.aspects.append(tags)
 
-
-
-
-                )
-                continue
+        yield MetadataWorkUnit(
+            id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+        )
 
-
-
-
-
-
-
-
+        yield from self._get_domain_wu(
+            title=chart_data.get("slice_name", ""),
+            entity_urn=chart_urn,
+        )
+
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        database_name = None
+        try:
+            chart_id = chart_data.get("id")
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+
+            # TODO: Make helper methods for database_pattern
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+
+                if datasource_id:
+                    if datasource_id in self.filtered_dataset_to_database:
+                        database_name = self.filtered_dataset_to_database[datasource_id]
+                        self.filtered_chart_to_database[chart_id] = database_name
+
+                        is_filtered = not self.config.database_pattern.allowed(
+                            database_name
                         )
+                        self.processed_charts[chart_id] = (database_name, is_filtered)
 
-                if
-                    dataset_name
-                ):
+                        if is_filtered:
                             self.report.warning(
-
+                                message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                context=str(
+                                    dict(
+                                        chart_id=chart_id,
+                                        chart_name=chart_name,
+                                        database_name=database_name,
+                                    )
+                                ),
+                                title="Incomplete Ingestion",
                             )
 
-
+                    else:
+                        dataset_response = self.get_dataset_info(datasource_id)
+                        database_name = (
+                            dataset_response.get("result", {})
+                            .get("database", {})
+                            .get("database_name")
+                        )
+
+                        if database_name:
+                            is_filtered = not self.config.database_pattern.allowed(
+                                database_name
+                            )
+                            if is_filtered:
+                                self.filtered_chart_to_database[chart_id] = (
+                                    database_name
+                                )
+                                self.filtered_dataset_to_database[datasource_id] = (
+                                    database_name
+                                )
+                            self.processed_charts[chart_id] = (
+                                database_name,
+                                is_filtered,
+                            )
 
-
-
-
-
-
-
-
-
-
-
-
+                            if is_filtered:
+                                self.report.warning(
+                                    message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                    context=str(
+                                        dict(
+                                            chart_id=chart_id,
+                                            chart_name=chart_name,
+                                            database_name=database_name,
+                                        )
+                                    ),
+                                    title="Incomplete Ingestion",
+                                )
+
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
+                    )
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            message="Chart uses a dataset that was filtered by dataset pattern. Update your dataset pattern to include this dataset.",
+                            context=str(
+                                dict(
+                                    chart_id=chart_id,
+                                    chart_name=chart_name,
+                                    dataset_name=dataset_name,
+                                )
+                            ),
+                            title="Incomplete Ingestion",
+                        )
+            if chart_id not in self.processed_charts:
+                self.processed_charts[chart_id] = (database_name, False)
+
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                message="Failed to construct chart snapshot. This chart will not be ingested.",
+                context=str(
+                    dict(chart_id=chart_id, chart_name=chart_name, error=str(e))
+                ),
+                title="Chart Construction Failed",
+                exc=e,
             )
+            return
+
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
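emit_chart_mces now fans chart processing out across worker threads. A small standalone sketch of the same pattern follows, with a toy worker standing in for _process_chart; the executor call mirrors the one in the hunk, and the import path is assumed to match the utility this module already uses:

# Sketch only: each worker is a generator; ThreadedIteratorExecutor.process
# runs workers concurrently and interleaves everything they yield.
from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

def process_item(item_id: int):
    # Hypothetical stand-in for _process_chart, which yields MetadataWorkUnits.
    yield f"workunit-for-{item_id}"

args_list = [(i,) for i in range(5)]  # one tuple of arguments per worker invocation
results = list(
    ThreadedIteratorExecutor.process(
        worker_func=process_item,
        args_list=args_list,
        max_workers=2,
    )
)
print(sorted(results))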
@@ -650,7 +1100,27 @@ class SupersetSource(StatefulIngestionSourceBase):
                 fieldPath=col.get("column_name", ""),
                 type=SchemaFieldDataType(data_type),
                 nativeDataType="",
-                description=col.get("column_name", ""),
+                description=col.get("description") or col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
                 nullable=True,
             )
             schema_fields.append(field)
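gen_metric_schema_fields maps each Superset metric to a SchemaField and falls back to a null type when the metric type cannot be resolved. A hedged sketch of that mapping for a single metric follows; the import paths and sample metric are assumptions, while the call pattern mirrors the hunk above:

# Sketch only: resolve a metric's SQL type and build a schema field for it.
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.schema_classes import (
    NullTypeClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
)

metric = {  # hypothetical metric entry from the Superset dataset API
    "metric_name": "total_revenue",
    "metric_type": "DOUBLE",
    "description": "Sum of order amounts",
}

data_type = resolve_sql_type(metric.get("metric_type", "")) or NullTypeClass()
field = SchemaFieldClass(
    fieldPath=metric.get("metric_name", ""),
    type=SchemaFieldDataTypeClass(data_type),
    nativeDataType=metric.get("metric_type", ""),
    description=metric.get("description", ""),
    nullable=True,
)
print(field.fieldPath, type(data_type).__name__)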
@@ -662,13 +1132,18 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=
+            fields=column_fields + metric_fields,
         )
         return schema_metadata
 
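gen_schema_metadata now concatenates column fields and metric fields into one schema aspect. A minimal sketch with hypothetical fields follows; the constructor call mirrors the hunk, and everything else is illustrative:

# Sketch only: columns and metrics sit side by side in the same schema aspect.
from datahub.emitter.mce_builder import make_data_platform_urn
from datahub.metadata.schema_classes import (
    MySqlDDLClass,
    NumberTypeClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
    StringTypeClass,
)

column_fields = [
    SchemaFieldClass(
        fieldPath="customer_name",  # hypothetical physical column
        type=SchemaFieldDataTypeClass(StringTypeClass()),
        nativeDataType="VARCHAR",
        nullable=True,
    )
]
metric_fields = [
    SchemaFieldClass(
        fieldPath="order_count",  # hypothetical Superset metric
        type=SchemaFieldDataTypeClass(NumberTypeClass()),
        nativeDataType="BIGINT",
        nullable=True,
    )
]

schema_metadata = SchemaMetadataClass(
    schemaName="example_table",
    platform=make_data_platform_urn("superset"),
    version=0,
    hash="",
    platformSchema=MySqlDDLClass(tableSchema=""),
    fields=column_fields + metric_fields,
)
print([f.fieldPath for f in schema_metadata.fields])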
@@ -680,6 +1155,106 @@ class SupersetSource(StatefulIngestionSourceBase):
             env=self.config.env,
         )
 
+    def generate_virtual_dataset_lineage(
+        self,
+        parsed_query_object: SqlParsingResult,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        cll = (
+            parsed_query_object.column_lineage
+            if parsed_query_object.column_lineage is not None
+            else []
+        )
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for cll_info in cll:
+            downstream = (
+                [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+                if cll_info.downstream and cll_info.downstream.column
+                else []
+            )
+            upstreams = [
+                make_schema_field_urn(column_ref.table, column_ref.column)
+                for column_ref in cll_info.upstreams
+            ]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=input_table_urn,
+                )
+                for input_table_urn in parsed_query_object.in_tables
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
+    def generate_physical_dataset_lineage(
+        self,
+        dataset_response: dict,
+        upstream_dataset: str,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        # To generate column level lineage, we can manually decode the metadata
+        # to produce the ColumnLineageInfo
+        columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for column in columns:
+            column_name = column.get("column_name", "")
+            if not column_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, column_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                )
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
     def construct_dataset_from_dataset_data(
         self, dataset_data: dict
     ) -> DatasetSnapshot:
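Both lineage helpers above express column-level lineage as FineGrainedLineage entries inside an UpstreamLineage aspect. A condensed sketch of one such entry follows; the classes and field names come from the hunk, while the URNs and column are hypothetical:

# Sketch only: one field-level edge from an upstream warehouse table to the
# Superset datasource, wrapped in the same aspect the helpers above return.
from datahub.emitter.mce_builder import make_schema_field_urn
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

upstream_table = "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.orders,PROD)"  # hypothetical
superset_dataset = "urn:li:dataset:(urn:li:dataPlatform:superset,orders_virtual,PROD)"  # hypothetical

lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(dataset=upstream_table, type=DatasetLineageTypeClass.TRANSFORMED)
    ],
    fineGrainedLineages=[
        FineGrainedLineageClass(
            upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
            upstreams=[make_schema_field_urn(upstream_table, "order_id")],
            downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
            downstreams=[make_schema_field_urn(superset_dataset, "order_id")],
        )
    ],
)
print(len(lineage.fineGrainedLineages or []))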
@@ -689,17 +1264,26 @@ class SupersetSource(StatefulIngestionSourceBase):
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
-        dataset_url = f"{self.config.display_uri}{
+        dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"
 
         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dataset_data.get("changed_on_utc",
+            dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
 
         upstream_warehouse_platform = (
             dataset_response.get("result", {}).get("database", {}).get("backend")
         )
+        upstream_warehouse_db_name = (
+            dataset_response.get("result", {}).get("database", {}).get("database_name")
+        )
+
+        # if we have rendered sql, we always use that and default back to regular sql
+        sql = dataset_response.get("result", {}).get(
+            "rendered_sql"
+        ) or dataset_response.get("result", {}).get("sql")
 
         # Preset has a way of naming their platforms differently than
         # how datahub names them, so map the platform name to the correct naming
@@ -712,40 +1296,47 @@ class SupersetSource(StatefulIngestionSourceBase):
         if upstream_warehouse_platform in warehouse_naming:
             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
 
-        # TODO: Categorize physical vs virtual upstream dataset
-        # mark all upstream dataset as physical for now, in the future we would ideally like
-        # to differentiate physical vs virtual upstream datasets
-        tag_urn = f"urn:li:tag:{self.platform}:physical"
         upstream_dataset = self.get_datasource_urn_from_id(
             dataset_response, upstream_warehouse_platform
         )
-
-
-
-
-
-
-
-
-
+
+        # Sometimes the field will be null instead of not existing
+        if sql == "null" or not sql:
+            tag_urn = f"urn:li:tag:{self.platform}:physical"
+            upstream_lineage = self.generate_physical_dataset_lineage(
+                dataset_response, upstream_dataset, datasource_urn
+            )
+        else:
+            tag_urn = f"urn:li:tag:{self.platform}:virtual"
+            parsed_query_object = create_lineage_sql_parsed_result(
+                query=sql,
+                default_db=upstream_warehouse_db_name,
+                platform=upstream_warehouse_platform,
+                platform_instance=None,
+                env=self.config.env,
+            )
+            upstream_lineage = self.generate_virtual_dataset_lineage(
+                parsed_query_object, datasource_urn
+            )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
-            description="",
+            description=dataset.description or "",
             externalUrl=dataset_url,
             lastModified=TimeStamp(time=modified_ts),
         )
-        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
 
-
-
-
-
-
-
-
-
-
+        dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+        tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+        aspects_items: List[Any] = [
+            self.gen_schema_metadata(dataset_response),
+            dataset_info,
+            upstream_lineage,
+        ]
+
+        if tags:
+            aspects_items.append(tags)
 
         dataset_snapshot = DatasetSnapshot(
             urn=datasource_urn,
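The branch above tags SQL-backed (virtual) datasets differently from physical ones and derives virtual lineage by parsing the dataset's SQL. A hedged sketch of the parsing call follows; the import path, sample query, and database name are assumptions, while the keyword arguments mirror the hunk:

# Sketch only: parse a virtual dataset's SQL to recover upstream tables and
# column-level lineage, as the virtual branch above does.
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

parsed = create_lineage_sql_parsed_result(
    query="SELECT order_id, SUM(amount) AS total FROM analytics.orders GROUP BY order_id",  # hypothetical
    default_db="analytics",  # hypothetical database_name from the Superset response
    platform="postgres",
    platform_instance=None,
    env="PROD",
)

print(parsed.in_tables)        # upstream dataset URNs discovered by the parser
print(parsed.column_lineage)   # column-level lineage, when it can be derived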
@@ -767,41 +1358,134 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dataset_snapshot
 
-    def
-
-
-
+    def _extract_and_map_tags(
+        self, raw_tags: List[Dict[str, Any]]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract and map Superset tags to DataHub GlobalTagsClass.
 
-
-
-
-
-
-            continue
+        Filters out system-generated tags (type != 1) and only processes user-defined tags
+        from the Superset API response.
+
+        Args:
+            raw_tags: List of tag dictionaries from Superset API
 
-
-
+        Returns:
+            GlobalTagsClass with user-defined tags, or None if no tags found
+        """
+        user_tags = [
+            tag.get("name", "")
+            for tag in raw_tags
+            if tag.get("type") == 1 and tag.get("name")
+        ]
+
+        if not user_tags:
+            return None
+
+        tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+        return GlobalTagsClass(
+            tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+        )
+
+    def _merge_tags_with_existing(
+        self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+    ) -> Optional[GlobalTagsClass]:
+        """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+        This method ensures that tags manually added via DataHub UI are not overwritten
+        during ingestion. It fetches existing tags from the graph and merges them with
+        new tags from the source system, avoiding duplicates.
+
+        Args:
+            entity_urn: URN of the entity to check for existing tags
+            new_tags: New tags to add as GlobalTagsClass object
+
+        Returns:
+            GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+        """
+        if not new_tags or not new_tags.tags:
+            return None
+
+        # Fetch existing tags from DataHub
+        existing_global_tags = None
+        if self.ctx.graph:
+            existing_global_tags = self.ctx.graph.get_aspect(
+                entity_urn=entity_urn, aspect_type=GlobalTagsClass
+            )
+
+        # Merge existing tags with new ones, avoiding duplicates
+        all_tags = []
+        existing_tag_urns = set()
+
+        if existing_global_tags and existing_global_tags.tags:
+            all_tags.extend(existing_global_tags.tags)
+            existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+        # Add new tags that don't already exist
+        for new_tag in new_tags.tags:
+            if new_tag.tag not in existing_tag_urns:
+                all_tags.append(new_tag)
+
+        return GlobalTagsClass(tags=all_tags) if all_tags else None
+
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_id = dataset_data.get("id")
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-
-
-            self.
-
+                return
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                dataset_response = self.get_dataset_info(dataset_id)
+                database_name = (
+                    dataset_response.get("result", {})
+                    .get("database", {})
+                    .get("database_name")
                 )
-
-
-
-
-
-
+
+                if database_name and not self.config.database_pattern.allowed(
+                    database_name
+                ):
+                    self.filtered_dataset_to_database[dataset_id] = database_name
+                    self.report.report_dropped(
+                        f"Dataset '{dataset_name}' filtered by database_pattern with database '{database_name}'"
+                    )
+                    return
+
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
             )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-
-        yield from self.emit_dashboard_mces()
-        if self.config.ingest_charts:
-            yield from self.emit_chart_mces()
+        # TODO: Possibly change ingestion order to minimize API calls
         if self.config.ingest_datasets:
             yield from self.emit_dataset_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [