acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/datahub/config.py +12 -9

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )
 
-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )
 
-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
    )
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )
 
-    structured_properties_template_cache_invalidation_interval: int =
-
-
-
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )
 
     query_timeout: Optional[int] = Field(
@@ -129,6 +128,10 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         description="Timeout for each query in seconds. ",
     )
 
+    preserve_system_metadata: bool = Field(
+        default=True, description="Copy system metadata from the source system"
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
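The new preserve_system_metadata flag (default True) is consumed by the database reader in the next file, which only rebuilds a SystemMetadataClass when the flag is enabled. A minimal recipe sketch for the DataHub-to-DataHub source, assuming the source's existing database_connection field; the connection values are placeholders and nothing here beyond the flag itself comes from this diff:

# Hypothetical recipe, expressed as the Python dict a pipeline would be built from.
recipe = {
    "source": {
        "type": "datahub",
        "config": {
            # Assumed existing connection config; host/credentials are placeholders.
            "database_connection": {
                "scheme": "mysql+pymysql",
                "host_port": "localhost:3306",
            },
            # New in this release: set to False to drop source-side system metadata
            # instead of copying it into the destination instance.
            "preserve_system_metadata": False,
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}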
datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11

@@ -12,7 +12,7 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 
 logger = logging.getLogger(__name__)
@@ -104,6 +104,22 @@ class DataHubDatabaseReader:
         ORDER BY mav.urn
         """
 
+    def _get_json_extract_expression(self) -> str:
+        """
+        Returns the appropriate JSON extraction expression based on the database dialect.
+
+        Returns:
+            Database-specific JSON extraction expression
+        """
+        # Return the correct JSON extraction expression for the "removed" field,
+        # depending on the database dialect.
+        if self.engine.dialect.name == "postgresql":
+            # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
+            return "((metadata::json)->>'removed')::boolean"
+        else:
+            # For other databases (e.g., MySQL), use JSON_EXTRACT.
+            return "JSON_EXTRACT(metadata, '$.removed')"
+
     def query(self, set_structured_properties_filter: bool) -> str:
         """
         Main query that gets data for specified date range with appropriate filters.
@@ -125,7 +141,7 @@ class DataHubDatabaseReader:
             LEFT JOIN (
                 SELECT
                     *,
-
+                    {self._get_json_extract_expression()} as removed
                 FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
                 WHERE aspect = 'status'
                     AND version = 0
@@ -241,15 +257,10 @@ class DataHubDatabaseReader:
             "end_createdon": end_date.strftime(DATETIME_FORMAT),
             "limit": limit,
            "offset": offset,
+            # Always pass exclude_aspects as a tuple, postgres doesn't support lists
+            "exclude_aspects": tuple(self.config.exclude_aspects),
         }
 
-        # Add exclude_aspects if needed
-        if (
-            hasattr(self.config, "exclude_aspects")
-            and self.config.exclude_aspects
-        ):
-            params["exclude_aspects"] = tuple(self.config.exclude_aspects)
-
         logger.info(
             f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
             f"with limit {limit} and offset {offset} (inclusive range)"
@@ -369,12 +380,16 @@ class DataHubDatabaseReader:
             json_metadata = post_json_transform(
                 json.loads(row["systemmetadata"] or "{}")
             )
-            system_metadata =
+            system_metadata = None
+            if self.config.preserve_system_metadata:
+                system_metadata = SystemMetadataClass.from_obj(json_metadata)
+                if system_metadata.properties:
+                    is_no_op = system_metadata.properties.pop("isNoOp", None)
+                    logger.debug(f"Removed potential value for is_no_op={is_no_op}")
             return MetadataChangeProposalWrapper(
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(
datahub/ingestion/source/datahub/datahub_source.py +10 -0

@@ -6,7 +6,9 @@ from typing import Dict, Iterable, List, Optional
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -17,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
 from datahub.ingestion.source.datahub.datahub_database_reader import (
@@ -37,6 +40,13 @@ logger = logging.getLogger(__name__)
 @platform_name("DataHub")
 @config_class(DataHubSourceConfig)
 @support_status(SupportStatus.TESTING)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class DataHubSource(StatefulIngestionSourceBase):
     platform: str = "datahub"
 
datahub/ingestion/source/dbt/dbt_cloud.py +16 -5

@@ -9,7 +9,9 @@ import requests
 from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -24,6 +26,7 @@ from datahub.ingestion.source.dbt.dbt_common import (
     DBTCommonConfig,
     DBTNode,
     DBTSourceBase,
+    DBTSourceReport,
 )
 from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 
@@ -261,8 +264,10 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
+    report: DBTSourceReport  # nothing cloud-specific in the report
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -365,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         name = node["alias"]
 
         comment = node.get("comment", "")
-
-
-
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")
 
         if node["resourceType"] == "model":
             materialization = node["materializedType"]
@@ -401,8 +409,11 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         if node["resourceType"] in {"model", "seed", "snapshot"}:
             status = node["status"]
             if status is None and materialization != "ephemeral":
-                self.report.
-
+                self.report.warning(
+                    title="Schema information may be incomplete",
+                    message="Some nodes are missing the `status` field, which dbt uses to track the status of the node in the target database.",
+                    context=key,
+                    log=False,
                 )
 
         # The code fields are new in dbt 1.3, and replace the sql ones.
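The new description handling above prefers the table-level description over the schema-level sourceDescription. A tiny illustration of that precedence, using made-up node payloads shaped like the dbt Cloud metadata API responses:

# Hypothetical node payloads; only the two description keys matter here.
node_with_table_desc = {"description": "Orders fact table", "sourceDescription": "Raw Shopify schema"}
node_schema_only = {"description": "", "sourceDescription": "Raw Shopify schema"}

print(node_with_table_desc["description"] or node_with_table_desc.get("sourceDescription", ""))
# -> "Orders fact table" (table-level wins)
print(node_schema_only["description"] or node_schema_only.get("sourceDescription", ""))
# -> "Raw Shopify schema" (falls back to the schema-level description)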
datahub/ingestion/source/dbt/dbt_common.py +224 -9

@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipSourceTypeClass,
     OwnershipTypeClass,
+    SiblingsClass,
     StatusClass,
     SubTypesClass,
     TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     ViewPropertiesClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import (
     SqlParsingDebugInfo,
@@ -120,6 +122,7 @@ logger = logging.getLogger(__name__)
 DBT_PLATFORM = "dbt"
 
 _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
+_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB
 
 
 @dataclass
@@ -145,6 +148,9 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
 
     nodes_filtered: LossyList[str] = field(default_factory=LossyList)
 
+    duplicate_sources_dropped: Optional[int] = None
+    duplicate_sources_references_updated: Optional[int] = None
+
 
 class EmitDirective(ConfigEnum):
     """A holder for directives for emission for specific types of entities"""
@@ -240,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
         return self.model_performance == EmitDirective.YES
 
 
+class MaterializedNodePatternConfig(ConfigModel):
+    """Configuration for filtering materialized nodes based on their physical location"""
+
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for database names to filter materialized nodes.",
+    )
+    schema_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
+    )
+    table_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
+    )
+
+
 class DBTCommonConfig(
     StatefulIngestionConfigBase,
     PlatformInstanceConfigMixin,
@@ -288,6 +311,11 @@ class DBTCommonConfig(
         default=AllowDenyPattern.allow_all(),
         description="regex patterns for dbt model names to filter in ingestion.",
     )
+    materialized_node_pattern: MaterializedNodePatternConfig = Field(
+        default=MaterializedNodePatternConfig(),
+        description="Advanced filtering for materialized nodes based on their physical database location. "
+        "Provides fine-grained control over database.schema.table patterns for catalog consistency.",
+    )
     meta_mapping: Dict = Field(
         default={},
         description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
@@ -355,7 +383,7 @@ class DBTCommonConfig(
     # override default value to True.
     incremental_lineage: bool = Field(
         default=True,
-        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
+        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
     )
 
     _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
@@ -370,6 +398,20 @@ class DBTCommonConfig(
         "Set to False to skip it for engines like AWS Athena where it's not required.",
     )
 
+    dbt_is_primary_sibling: bool = Field(
+        default=True,
+        description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
+        "When True (default), dbt entities are primary and target platform entities are secondary. "
+        "When False, target platform entities are primary and dbt entities are secondary. "
+        "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
+    )
+
+    drop_duplicate_sources: bool = Field(
+        default=True,
+        description="When enabled, drops sources that have the same name in the target platform as a model. "
+        "This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
+    )
+
     @validator("target_platform")
     def validate_target_platform_value(cls, target_platform: str) -> str:
         if target_platform.lower() == DBT_PLATFORM:
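The three materialized_node_pattern sub-patterns above are matched against the node's physical location: database_pattern against the database name, schema_pattern against "{database}.{schema}", and table_pattern against "{database}.{schema}.{table}" (see _is_allowed_materialized_node in a later hunk). A small sketch of how the underlying AllowDenyPattern behaves; the regex and names are illustrative only:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical deny rule: skip anything materialized into a tmp_* schema.
schema_pattern = AllowDenyPattern(deny=[r"analytics\.tmp_.*"])

print(schema_pattern.allowed("analytics.reporting"))    # True  - not denied; default allow is ".*"
print(schema_pattern.allowed("analytics.tmp_scratch"))  # False - matches the deny regex

In a recipe, such a rule would sit under materialized_node_pattern.schema_pattern.deny in the dbt source config.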
@@ -509,7 +551,7 @@ class DBTNode:
     raw_code: Optional[str]
 
     dbt_adapter: str
-    dbt_name: str
+    dbt_name: str  # dbt unique identifier
     dbt_file_path: Optional[str]
     dbt_package_name: Optional[str]  # this is pretty much always present
 
@@ -823,7 +865,9 @@ def get_column_type(
 @platform_name("dbt")
 @config_class(DBTCommonConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -973,6 +1017,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         self._infer_schemas_and_update_cll(all_nodes_map)
 
         nodes = self._filter_nodes(all_nodes)
+        nodes = self._drop_duplicate_sources(nodes)
+
         non_test_nodes = [
             dataset_node for dataset_node in nodes if dataset_node.node_type != "test"
         ]
@@ -994,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             all_nodes_map,
         )
 
-    def _is_allowed_node(self,
-
+    def _is_allowed_node(self, node: DBTNode) -> bool:
+        """
+        Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
+        """
+        if not self.config.node_name_pattern.allowed(node.dbt_name):
+            return False
+
+        if not self._is_allowed_materialized_node(node):
+            return False
+
+        return True
+
+    def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
+        """Filter nodes based on their materialized database location for catalog consistency"""
+
+        # Database level filtering
+        if not node.database:
+            return True
+        if not self.config.materialized_node_pattern.database_pattern.allowed(
+            node.database
+        ):
+            return False
+
+        # Schema level filtering: {database}.{schema}
+        if not node.schema:
+            return True
+        if not self.config.materialized_node_pattern.schema_pattern.allowed(
+            node._join_parts([node.database, node.schema])
+        ):
+            return False
+
+        # Table level filtering: {database}.{schema}.{table}
+        if not node.name:
+            return True
+        if not self.config.materialized_node_pattern.table_pattern.allowed(
+            node.get_db_fqn()
+        ):
+            return False
+
+        return True
 
     def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
-        nodes = []
+        nodes: List[DBTNode] = []
         for node in all_nodes:
             key = node.dbt_name
 
-            if not self._is_allowed_node(
+            if not self._is_allowed_node(node):
                 self.report.nodes_filtered.append(key)
                 continue
 
@@ -1010,6 +1094,62 @@ class DBTSourceBase(StatefulIngestionSourceBase):
 
         return nodes
 
+    def _drop_duplicate_sources(self, original_nodes: List[DBTNode]) -> List[DBTNode]:
+        """Detect and correct cases where a model and source have the same name.
+
+        In these cases, we don't want to generate both because they'll have the same
+        urn and hence overwrite each other. Instead, we drop the source and update
+        references to it to point at the model.
+
+        The risk here is that the source might have documentation that'd be lost,
+        which is why we maintain optionality with a config flag.
+        """
+        if not self.config.drop_duplicate_sources:
+            return original_nodes
+
+        self.report.duplicate_sources_dropped = 0
+        self.report.duplicate_sources_references_updated = 0
+
+        # Pass 1 - find all model names in the warehouse.
+        warehouse_model_names: Dict[str, str] = {}  # warehouse name -> model unique id
+        for node in original_nodes:
+            if node.node_type == "model" and node.exists_in_target_platform:
+                warehouse_model_names[node.get_db_fqn()] = node.dbt_name
+
+        # Pass 2 - identify + drop duplicate sources.
+        source_references_to_update: Dict[
+            str, str
+        ] = {}  # source unique id -> model unique id
+        nodes: List[DBTNode] = []
+        for node in original_nodes:
+            if (
+                node.node_type == "source"
+                and node.exists_in_target_platform
+                and (model_name := warehouse_model_names.get(node.get_db_fqn()))
+            ):
+                self.report.warning(
+                    title="Duplicate model and source names detected",
+                    message="We found a dbt model and dbt source with the same name. To ensure reliable lineage generation, the source node was ignored. "
+                    "If you associated documentation/tags/other metadata with the source, it will be lost. "
+                    "To avoid this, you should remove the source node from your dbt project and replace any `source(<source_name>)` calls with `ref(<model_name>)`.",
+                    context=f"{node.dbt_name} (called {node.get_db_fqn()} in {self.config.target_platform}) duplicates {model_name}",
+                )
+                self.report.duplicate_sources_dropped += 1
+                source_references_to_update[node.dbt_name] = model_name
+            else:
+                nodes.append(node)
+
+        # Pass 3 - update references to the dropped sources.
+        for node in nodes:
+            for i, current_upstream in enumerate(node.upstream_nodes):
+                if current_upstream in source_references_to_update:
+                    node.upstream_nodes[i] = source_references_to_update[
+                        current_upstream
+                    ]
+                    self.report.duplicate_sources_references_updated += 1
+
+        return nodes
+
     @staticmethod
     def _to_schema_info(schema_fields: List[SchemaField]) -> SchemaInfo:
         return {column.fieldPath: column.nativeDataType for column in schema_fields}
@@ -1038,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)
 
-        for dbt_name in all_nodes_map:
-            if self._is_allowed_node(
+        for dbt_name, dbt_node in all_nodes_map.items():
+            if self._is_allowed_node(dbt_node):
                 add_node_to_cll_list(dbt_name)
 
         return schema_nodes, cll_nodes
@@ -1334,6 +1474,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )
 
         action_processor_tag = OperationProcessor(
@@ -1405,6 +1546,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             dataset_snapshot = DatasetSnapshot(
                 urn=node_datahub_urn, aspects=list(snapshot_aspects)
             )
+            # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
+            if self._should_create_sibling_relationships(node):
+                # Get the target platform URN
+                target_platform_urn = node.get_urn(
+                    self.config.target_platform,
+                    self.config.env,
+                    self.config.target_platform_instance,
+                )
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=node_datahub_urn,
+                    aspect=SiblingsClass(
+                        siblings=[target_platform_urn],
+                        primary=self.config.dbt_is_primary_sibling,
+                    ),
+                ).as_workunit()
+
             mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
             if self.config.write_semantics == "PATCH":
                 mce = self.get_patched_mce(mce)
@@ -1508,6 +1666,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if not node.exists_in_target_platform:
                 continue
 
+            # Emit sibling patch for target platform entity BEFORE any other aspects.
+            # This ensures the hook can detect explicit primary settings when processing later aspects.
+            if self._should_create_sibling_relationships(node):
+                # Get the dbt platform URN
+                dbt_platform_urn = node.get_urn(
+                    DBT_PLATFORM,
+                    self.config.env,
+                    self.config.platform_instance,
+                )
+
+                # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
+                target_patch = DatasetPatchBuilder(node_datahub_urn)
+                target_patch.add_sibling(
+                    dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
+                )
+
+                yield from auto_workunit(
+                    MetadataWorkUnit(
+                        id=MetadataWorkUnit.generate_workunit_id(mcp),
+                        mcp_raw=mcp,
+                        is_primary_source=False,  # Not authoritative over warehouse metadata
+                    )
+                    for mcp in target_patch.build()
+                )
+
             # This code block is run when we are generating entities of platform type.
             # We will not link the platform not to the dbt node for type "source" because
             # in this case the platform table existed first.
@@ -1614,6 +1797,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
     def get_external_url(self, node: DBTNode) -> Optional[str]:
         pass
 
+    @staticmethod
+    def _truncate_code(code: str, max_length: int) -> str:
+        if len(code) > max_length:
+            return code[:max_length] + "..."
+        return code
+
     def _create_view_properties_aspect(
         self, node: DBTNode
     ) -> Optional[ViewPropertiesClass]:
@@ -1625,6 +1814,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             compiled_code = try_format_query(
                 node.compiled_code, platform=self.config.target_platform
            )
+            compiled_code = self._truncate_code(
+                compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
+            )
 
         materialized = node.materialization in {"table", "incremental", "snapshot"}
         view_properties = ViewPropertiesClass(
@@ -1705,6 +1897,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             self.config.tag_prefix,
             "SOURCE_CONTROL",
             self.config.strip_user_ids_from_email,
+            match_nested_props=True,
         )
 
         canonical_schema: List[SchemaField] = []
@@ -2053,5 +2246,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 term_id_set.add(existing_term.urn)
         return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
 
+    def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
+        """
+        Determines whether to emit sibling relationships for a dbt node.
+
+        Sibling relationships (both dbt entity's aspect and target entity's patch) are only
+        emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
+        relationships. When dbt_is_primary_sibling=True,
+        the SiblingAssociationHook handles sibling creation automatically.
+
+        Args:
+            node: The dbt node to evaluate
+
+        Returns:
+            True if sibling patches should be emitted for this node
+        """
+        # Only create siblings for entities that exist in target platform
+        if not node.exists_in_target_platform:
+            return False
+
+        # Only emit patches when explicit primary/secondary control is needed
+        return self.config.dbt_is_primary_sibling is False
+
     def get_report(self):
         return self.report
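For reference, when dbt_is_primary_sibling is set to False the hunks above emit two things per node: a full Siblings aspect on the dbt entity (marked non-primary) and an additive patch on the warehouse entity (marked primary). A standalone sketch of those two emissions, with hypothetical URNs standing in for a real model and its warehouse table:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SiblingsClass
from datahub.specific.dataset import DatasetPatchBuilder

# Hypothetical URNs for a dbt model and its Snowflake counterpart.
dbt_urn = "urn:li:dataset:(urn:li:dataPlatform:dbt,analytics.orders,PROD)"
warehouse_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.orders,PROD)"

# dbt side: full Siblings aspect, non-primary because dbt_is_primary_sibling=False.
dbt_sibling_mcp = MetadataChangeProposalWrapper(
    entityUrn=dbt_urn,
    aspect=SiblingsClass(siblings=[warehouse_urn], primary=False),
)

# Warehouse side: additive patch so existing sibling entries are preserved, marked primary.
patch_builder = DatasetPatchBuilder(warehouse_urn)
patch_builder.add_sibling(dbt_urn, primary=True)
warehouse_sibling_mcps = list(patch_builder.build())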
datahub/ingestion/source/dbt/dbt_core.py +3 -0

@@ -15,7 +15,9 @@ from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -464,6 +466,7 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
     report: DBTCoreReport