acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re

@@ -12,7 +13,9 @@ from pydantic import BaseModel, Field, validator
 
 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
     capability,
     config_class,

@@ -21,7 +24,6 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )

@@ -40,19 +42,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json
-        "
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
    )
-    catalog_path: str = Field(
-
-        "
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json.
-        "specified, last-modified fields will not be populated.
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],

@@ -161,7 +172,7 @@ def get_columns(
 
 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,

@@ -186,15 +197,6 @@ def extract_dbt_entities(
     ):
         name = manifest_node["alias"]
 
-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model

@@ -204,8 +206,9 @@ def extract_dbt_entities(
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]
 
-
-
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
 

@@ -214,16 +217,23 @@ def extract_dbt_entities(
             # Test and ephemeral nodes will never show up in the catalog.
             missing_from_catalog = False
         else:
-            if not only_include_if_in_catalog:
+            if all_catalog_entities is not None and not only_include_if_in_catalog:
+                # If the catalog file is missing, we have already generated a general message.
                 report.warning(
                     title="Node missing from catalog",
                     message="Found a node in the manifest file but not in the catalog. "
                     "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                    "Some metadata,
+                    "Some metadata, particularly schema information, will be impacted.",
                     context=key,
                 )
             else:
-                catalog_type =
+                catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]
 
         query_tag_props = manifest_node.get("query_tag", {})
 

@@ -231,12 +241,15 @@ def extract_dbt_entities(
 
         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta"
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}
 
         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})
 
         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None

@@ -343,6 +356,9 @@ class DBTRunResult(BaseModel):
     def timing_map(self) -> Dict[str, DBTRunTiming]:
         return {x.name: x for x in self.timing if x.name}
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 class DBTRunMetadata(BaseModel):
     dbt_schema_version: str

@@ -355,12 +371,7 @@ def _parse_test_result(
     dbt_metadata: DBTRunMetadata,
     run_result: DBTRunResult,
 ) -> Optional[DBTTestResult]:
-    if run_result.
-        # This was probably a docs generate run result, so this isn't actually
-        # a test result.
-        return None
-
-    if run_result.status != "pass":
+    if not run_result.has_success_status():
         native_results = {"message": run_result.message or ""}
         if run_result.failures:
             native_results.update({"failures": str(run_result.failures)})

@@ -455,15 +466,19 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:

@@ -473,9 +488,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(

@@ -513,11 +529,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-
-
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )
 
+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection

@@ -530,18 +566,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")
 
-        catalog_schema =
-        catalog_version =
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")
 
         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]
 
         all_manifest_entities = {**manifest_nodes, **manifest_sources}
 
-
-
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]
 
-
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}
 
         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,

@@ -592,7 +633,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
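Taken together, the dbt_core.py hunks above make catalog_path optional: the catalog is loaded only when configured, manifest and catalog metadata are recorded on the new DBTCoreReport, and a missing catalog is downgraded to a warning instead of a hard failure. A minimal sketch of a recipe that relies on the new behavior, assuming the standard Pipeline API; the paths and target platform below are hypothetical:

# Hedged sketch: runs the dbt source without a catalog.json, which this
# version now permits (with a "No catalog file configured" warning).
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",  # hypothetical path
                # "catalog_path" omitted: column/schema metadata will be incomplete
                "target_platform": "postgres",  # hypothetical target
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()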
datahub/ingestion/source/dbt/dbt_tests.py

@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     AssertionInfoClass,
     AssertionResultClass,

@@ -43,6 +42,9 @@ class DBTTestResult:
 
     native_results: Dict[str, str]
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
     """

@@ -157,7 +159,7 @@ def make_assertion_from_test(
     node: "DBTNode",
     assertion_urn: str,
     upstream_urn: str,
-) ->
+) -> MetadataChangeProposalWrapper:
     assert node.test_info
     qualified_test_name = node.test_info.qualified_test_name
     column_name = node.test_info.column_name

@@ -231,7 +233,7 @@ def make_assertion_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertion_info,
-    )
+    )

@@ -240,7 +242,7 @@ def make_assertion_result_from_test(
     assertion_urn: str,
     upstream_urn: str,
     test_warnings_are_errors: bool,
-) ->
+) -> MetadataChangeProposalWrapper:
     assertionResult = AssertionRunEventClass(
         timestampMillis=int(test_result.execution_time.timestamp() * 1000.0),
         assertionUrn=assertion_urn,

@@ -249,7 +251,7 @@ def make_assertion_result_from_test(
         result=AssertionResultClass(
             type=(
                 AssertionResultTypeClass.SUCCESS
-                if test_result.
+                if test_result.has_success_status()
                 or (not test_warnings_are_errors and test_result.status == "warn")
                 else AssertionResultTypeClass.FAILURE
             ),

@@ -261,4 +263,4 @@ def make_assertion_result_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertionResult,
-    )
+    )
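Both DBTRunResult (dbt_core.py) and DBTTestResult (dbt_tests.py) now expose the same has_success_status helper, so a model status of "success" and a test status of "pass" are treated uniformly when choosing between AssertionResultTypeClass.SUCCESS and FAILURE. A self-contained sketch of the check, mirroring the helper added above; the status strings follow dbt's run_results.json conventions:

# Sketch of the shared status check; "pass"/"warn"/"fail" come from dbt
# tests, "success"/"error" from model builds.
def has_success_status(status: str) -> bool:
    return status in ("pass", "success")

assert has_success_status("pass")      # passing dbt test
assert has_success_status("success")   # successful model build
assert not has_success_status("warn")  # handled via test_warnings_are_errors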
datahub/ingestion/source/debug/__init__.py

File without changes
datahub/ingestion/source/debug/datahub_debug.py

@@ -0,0 +1,300 @@
+import logging
+import socket
+import time
+from typing import Iterable, Optional
+from urllib.parse import urlparse
+
+import dns.exception
+import dns.resolver
+import requests
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+logger = logging.getLogger(__name__)
+
+
+class DataHubDebugSourceConfig(ConfigModel):
+    dns_probe_url: Optional[str] = None
+
+
+@platform_name("DataHubDebug")
+@config_class(DataHubDebugSourceConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubDebugSource(Source):
+    """
+    DataHubDebugSource is helper to debug things in executor where ingestion is running.
+
+    This source can perform the following tasks:
+    1. Network probe of a URL. Different from test connection in sources as that is after source starts.
+
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubDebugSourceConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = SourceReport()
+        self.report.event_not_produced_warn = False
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = DataHubDebugSourceConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def perform_dns_probe(self, url: str) -> None:
+        """
+        Perform comprehensive DNS probe and network connectivity tests.
+        Logs detailed information to help diagnose network issues.
+        """
+        logger.info(f"Starting DNS probe for URL: {url}")
+        logger.info("=" * 60)
+        logger.info(f"DNS PROBE REPORT FOR: {url}")
+        logger.info("=" * 60)
+
+        try:
+            # Parse the URL to extract hostname
+            parsed_url = urlparse(
+                url if url.startswith(("http://", "https://")) else f"http://{url}"
+            )
+            hostname = parsed_url.hostname or parsed_url.netloc
+            port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)
+
+            logger.info(f"Parsed hostname: {hostname}")
+            logger.info(f"Target port: {port}")
+            logger.info(f"URL scheme: {parsed_url.scheme}")
+            logger.info("-" * 60)
+
+            # Test 1: Enhanced DNS resolution with dnspython if available
+            logger.info("1. DNS RESOLUTION TEST")
+            self._dns_probe_with_dnspython(hostname)
+
+            logger.info("-" * 60)
+
+            # Test 2: HTTP/HTTPS connectivity test with requests if available
+            logger.info("2. HTTP CONNECTIVITY TEST")
+            self._http_probe_with_requests(url)
+
+            logger.info("-" * 60)
+
+            # Test 3: System network information
+            logger.info("3. SYSTEM NETWORK INFORMATION")
+            self._log_system_network_info()
+
+        except Exception as e:
+            logger.error(f"DNS probe failed with unexpected error: {e}", exc_info=True)
+
+        logger.info("=" * 60)
+        logger.info("DNS PROBE COMPLETED")
+        logger.info("=" * 60)
+
+    def _dns_probe_with_dnspython(self, hostname: str) -> None:
+        """Enhanced DNS probing using dnspython library"""
+        try:
+            # Test different record types
+            record_types = ["A", "AAAA", "CNAME", "MX"]
+
+            for record_type in record_types:
+                try:
+                    start_time = time.time()
+                    answers = dns.resolver.resolve(hostname, record_type)
+                    dns_time = time.time() - start_time
+
+                    logger.info(
+                        f"✓ {record_type} record resolution successful ({dns_time:.3f}s)"
+                    )
+                    for answer in answers:
+                        logger.info(f"  - {record_type}: {answer}")
+
+                except dns.resolver.NXDOMAIN:
+                    logger.info(f"✗ {record_type} record: Domain does not exist")
+                except dns.resolver.NoAnswer:
+                    logger.info(
+                        f"- {record_type} record: No answer (record type not available)"
+                    )
+                except dns.exception.Timeout:
+                    logger.error(f"✗ {record_type} record: DNS query timed out")
+                except Exception as e:
+                    logger.error(f"✗ {record_type} record query failed: {e}")
+
+            # Test different DNS servers
+            logger.info("Testing with different DNS servers:")
+            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+
+            for dns_server in dns_servers:
+                try:
+                    resolver = dns.resolver.Resolver()
+                    resolver.nameservers = [dns_server]
+                    resolver.timeout = 5
+
+                    start_time = time.time()
+                    answers = resolver.resolve(hostname, "A")
+                    dns_time = time.time() - start_time
+
+                    logger.info(
+                        f"✓ DNS server {dns_server} responded ({dns_time:.3f}s)"
+                    )
+                    for answer in answers:
+                        logger.info(f"  - A: {answer}")
+
+                except Exception as e:
+                    logger.error(f"✗ DNS server {dns_server} failed: {e}")
+
+        except Exception as e:
+            logger.error(f"Enhanced DNS probe failed: {e}", exc_info=True)
+
+    def _http_probe_with_requests(self, url: str) -> None:
+        """HTTP connectivity test using requests library"""
+        try:
+            # Test with different timeouts and methods
+            timeout = 10
+            allow_redirects_head = True
+            allow_redirects_get = False
+
+            # Test HEAD request
+            try:
+                logger.info(f"Testing HEAD request with timeout {timeout}s")
+                start_time = time.time()
+
+                response = requests.head(
+                    url, timeout=timeout, allow_redirects=allow_redirects_head
+                )
+
+                request_time = time.time() - start_time
+
+                logger.info(f"✓ HEAD request successful ({request_time:.3f}s)")
+                logger.info(f"  Status code: {response.status_code}")
+                logger.info(
+                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                )
+
+                if hasattr(response, "url") and response.url != url:
+                    logger.info(f"  Final URL after redirects: {response.url}")
+
+            except requests.exceptions.Timeout:
+                logger.error(f"✗ HEAD request timed out after {timeout}s")
+            except requests.exceptions.ConnectionError as e:
+                logger.error(f"✗ HEAD connection error: {e}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"✗ HEAD request failed: {e}")
+            except Exception as e:
+                logger.error(f"✗ HEAD unexpected error: {e}")
+
+            # Test GET request
+            try:
+                logger.info(f"Testing GET request with timeout {timeout}s")
+                start_time = time.time()
+
+                response = requests.get(
+                    url, timeout=timeout, allow_redirects=allow_redirects_get
+                )
+
+                request_time = time.time() - start_time
+
+                logger.info(f"✓ GET request successful ({request_time:.3f}s)")
+                logger.info(f"  Status code: {response.status_code}")
+                logger.info(
+                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                )
+
+                if hasattr(response, "url") and response.url != url:
+                    logger.info(f"  Final URL after redirects: {response.url}")
+
+            except requests.exceptions.Timeout:
+                logger.error(f"✗ GET request timed out after {timeout}s")
+            except requests.exceptions.ConnectionError as e:
+                logger.error(f"✗ GET connection error: {e}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"✗ GET request failed: {e}")
+            except Exception as e:
+                logger.error(f"✗ GET unexpected error: {e}")
+
+        except Exception as e:
+            logger.error(f"HTTP probe failed: {e}", exc_info=True)
+
+    def _log_dns_troubleshooting(self) -> None:
+        """Log DNS troubleshooting information"""
+        logger.info("DNS TROUBLESHOOTING SUGGESTIONS:")
+        logger.info("- Check if the hostname is correct")
+        logger.info("- Verify DNS server configuration")
+        logger.info("- Check network connectivity")
+        logger.info("- Try using a different DNS server (8.8.8.8, 1.1.1.1)")
+        logger.info("- Check if there are firewall restrictions")
+
+    def _log_system_network_info(self) -> None:
+        """Log system network configuration information"""
+        try:
+            local_hostname = socket.gethostname()
+            logger.info(f"Local hostname: {local_hostname}")
+
+            try:
+                local_ips = socket.getaddrinfo(local_hostname, None)
+                logger.info("Local IP addresses:")
+                for addr_info in local_ips:
+                    if addr_info[0] in [socket.AF_INET, socket.AF_INET6]:
+                        family = "IPv4" if addr_info[0] == socket.AF_INET else "IPv6"
+                        logger.info(f"  - {addr_info[4][0]} ({family})")
+            except Exception as e:
+                logger.warning(f"Could not retrieve local IP addresses: {e}")
+
+            logger.info("DNS Server Connectivity:")
+            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+            for dns_server in dns_servers:
+                try:
+                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                    sock.settimeout(5)
+                    result = sock.connect_ex((dns_server, 53))
+                    if result == 0:
+                        logger.info(f"  ✓ Can reach {dns_server}:53")
+                    else:
+                        logger.error(f"  ✗ Cannot reach {dns_server}:53")
+                    sock.close()
+                except Exception as e:
+                    logger.error(f"  ✗ Error testing {dns_server}:53 - {e}")
+
+        except Exception as e:
+            logger.warning(f"Could not gather system network info: {e}")
+
+    def _test_alternative_dns(self, hostname: str) -> None:
+        """Test hostname resolution using alternative methods"""
+        try:
+            families = [(socket.AF_INET, "IPv4"), (socket.AF_INET6, "IPv6")]
+
+            for family, family_name in families:
+                try:
+                    result = socket.getaddrinfo(hostname, None, family)
+                    if result:
+                        logger.info(f"✓ {family_name} resolution successful:")
+                        for addr_info in result[:3]:
+                            logger.info(f"  - {addr_info[4][0]}")
+                    else:
+                        logger.warning(
+                            f"✗ {family_name} resolution returned no results"
+                        )
+                except socket.gaierror:
+                    logger.error(f"✗ {family_name} resolution failed")
+                except Exception as e:
+                    logger.error(f"✗ {family_name} resolution error: {e}")
+
+        except Exception as e:
+            logger.error(f"Alternative DNS test failed: {e}")
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.dns_probe_url is not None:
+            # Perform DNS probe
+            logger.info(f"Performing DNS probe for: {self.config.dns_probe_url}")
+            self.perform_dns_probe(self.config.dns_probe_url)
+
+        yield from []
+
+    def get_report(self) -> SourceReport:
+        return self.report
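The new debug source emits no metadata; it only logs DNS, HTTP, and local-network diagnostics from wherever the ingestion executor runs, driven by the single dns_probe_url option. A hedged sketch of invoking it, assuming it is registered under the type name "datahub-debug" (the actual entry-point name lives in entry_points.txt, which is not shown in this extract):

# Hedged sketch: "datahub-debug" as the source type is an assumption;
# check entry_points.txt for the registered name.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "datahub-debug",  # assumed entry-point name
            "config": {"dns_probe_url": "https://example.com"},
        },
        "sink": {"type": "console"},  # nothing is emitted; results go to the logs
    }
)
pipeline.run()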
datahub/ingestion/source/delta_lake/config.py

@@ -13,8 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
-from datahub.ingestion.source.state.
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulIngestionConfigBase,
+    StatefulStaleMetadataRemovalConfig,
 )
 
 # hide annoying debug errors from py4j

@@ -39,9 +40,7 @@ class S3(ConfigModel):
 
 
 class DeltaLakeSourceConfig(
-    PlatformInstanceConfigMixin,
-    EnvConfigMixin,
-    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin, EnvConfigMixin, StatefulIngestionConfigBase
 ):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "

@@ -78,7 +77,12 @@
         "When set to `False`, number_of_files in delta table can not be reported.",
     )
 
-    s3: Optional[S3] = Field()
+    s3: Optional[S3] = Field(None)
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+        default=None,
+        description="Stateful Ingestion Config with stale metadata removal",
+    )
 
     @cached_property
     def is_s3(self):
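With stateful_ingestion now a first-class field on DeltaLakeSourceConfig, the delta-lake source can opt into stale-metadata removal. A minimal sketch of the new field in use, assuming the usual options carried by StatefulStaleMetadataRemovalConfig (enabled, remove_stale_metadata) and a hypothetical S3 path:

# Sketch only: bucket/path are hypothetical, and stateful ingestion
# additionally needs a pipeline name and state provider at run time.
from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig

config = DeltaLakeSourceConfig.parse_obj(
    {
        "base_path": "s3://example-bucket/tables/orders",
        "stateful_ingestion": {
            "enabled": True,
            "remove_stale_metadata": True,
        },
    }
)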