acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ from abc import abstractmethod
|
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from enum import auto
|
|
7
|
-
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
|
7
|
+
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
8
8
|
|
|
9
9
|
import more_itertools
|
|
10
10
|
import pydantic
|
|
@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
|
|
|
91
91
|
OwnershipClass,
|
|
92
92
|
OwnershipSourceTypeClass,
|
|
93
93
|
OwnershipTypeClass,
|
|
94
|
+
SiblingsClass,
|
|
94
95
|
StatusClass,
|
|
95
96
|
SubTypesClass,
|
|
96
97
|
TagAssociationClass,
|
|
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
|
|
|
98
99
|
ViewPropertiesClass,
|
|
99
100
|
)
|
|
100
101
|
from datahub.metadata.urns import DatasetUrn
|
|
102
|
+
from datahub.specific.dataset import DatasetPatchBuilder
|
|
101
103
|
from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
|
|
102
104
|
from datahub.sql_parsing.sqlglot_lineage import (
|
|
103
105
|
SqlParsingDebugInfo,
|
|
@@ -120,17 +122,25 @@ logger = logging.getLogger(__name__)
|
|
|
120
122
|
DBT_PLATFORM = "dbt"
|
|
121
123
|
|
|
122
124
|
_DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
|
|
125
|
+
_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024 # 1MB
|
|
123
126
|
|
|
124
127
|
|
|
125
128
|
@dataclass
|
|
126
129
|
class DBTSourceReport(StaleEntityRemovalSourceReport):
|
|
127
130
|
sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
|
|
131
|
+
sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
|
|
128
132
|
sql_parser_parse_failures: int = 0
|
|
129
133
|
sql_parser_detach_ctes_failures: int = 0
|
|
130
134
|
sql_parser_table_errors: int = 0
|
|
131
135
|
sql_parser_column_errors: int = 0
|
|
132
136
|
sql_parser_successes: int = 0
|
|
133
137
|
|
|
138
|
+
# Details on where column info comes from.
|
|
139
|
+
nodes_with_catalog_columns: int = 0
|
|
140
|
+
nodes_with_inferred_columns: int = 0
|
|
141
|
+
nodes_with_graph_columns: int = 0
|
|
142
|
+
nodes_with_no_columns: int = 0
|
|
143
|
+
|
|
134
144
|
sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
|
|
135
145
|
sql_parser_detach_ctes_failures_list: LossyList[str] = field(
|
|
136
146
|
default_factory=LossyList
|
|
@@ -138,6 +148,9 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
|
|
|
138
148
|
|
|
139
149
|
nodes_filtered: LossyList[str] = field(default_factory=LossyList)
|
|
140
150
|
|
|
151
|
+
duplicate_sources_dropped: Optional[int] = None
|
|
152
|
+
duplicate_sources_references_updated: Optional[int] = None
|
|
153
|
+
|
|
141
154
|
|
|
142
155
|
class EmitDirective(ConfigEnum):
|
|
143
156
|
"""A holder for directives for emission for specific types of entities"""
|
|
@@ -233,6 +246,23 @@ class DBTEntitiesEnabled(ConfigModel):
|
|
|
233
246
|
return self.model_performance == EmitDirective.YES
|
|
234
247
|
|
|
235
248
|
|
|
249
|
+
class MaterializedNodePatternConfig(ConfigModel):
|
|
250
|
+
"""Configuration for filtering materialized nodes based on their physical location"""
|
|
251
|
+
|
|
252
|
+
database_pattern: AllowDenyPattern = Field(
|
|
253
|
+
default=AllowDenyPattern.allow_all(),
|
|
254
|
+
description="Regex patterns for database names to filter materialized nodes.",
|
|
255
|
+
)
|
|
256
|
+
schema_pattern: AllowDenyPattern = Field(
|
|
257
|
+
default=AllowDenyPattern.allow_all(),
|
|
258
|
+
description="Regex patterns for schema names in format '{database}.{schema}' to filter materialized nodes.",
|
|
259
|
+
)
|
|
260
|
+
table_pattern: AllowDenyPattern = Field(
|
|
261
|
+
default=AllowDenyPattern.allow_all(),
|
|
262
|
+
description="Regex patterns for table/view names in format '{database}.{schema}.{table}' to filter materialized nodes.",
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
|
|
236
266
|
class DBTCommonConfig(
|
|
237
267
|
StatefulIngestionConfigBase,
|
|
238
268
|
PlatformInstanceConfigMixin,
|
|
@@ -281,6 +311,11 @@ class DBTCommonConfig(
|
|
|
281
311
|
default=AllowDenyPattern.allow_all(),
|
|
282
312
|
description="regex patterns for dbt model names to filter in ingestion.",
|
|
283
313
|
)
|
|
314
|
+
materialized_node_pattern: MaterializedNodePatternConfig = Field(
|
|
315
|
+
default=MaterializedNodePatternConfig(),
|
|
316
|
+
description="Advanced filtering for materialized nodes based on their physical database location. "
|
|
317
|
+
"Provides fine-grained control over database.schema.table patterns for catalog consistency.",
|
|
318
|
+
)
|
|
284
319
|
meta_mapping: Dict = Field(
|
|
285
320
|
default={},
|
|
286
321
|
description="mapping rules that will be executed against dbt meta properties. Refer to the section below on dbt meta automated mappings.",
|
|
@@ -348,7 +383,7 @@ class DBTCommonConfig(
|
|
|
348
383
|
# override default value to True.
|
|
349
384
|
incremental_lineage: bool = Field(
|
|
350
385
|
default=True,
|
|
351
|
-
description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
|
|
386
|
+
description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
|
|
352
387
|
)
|
|
353
388
|
|
|
354
389
|
_remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
|
|
@@ -363,6 +398,20 @@ class DBTCommonConfig(
|
|
|
363
398
|
"Set to False to skip it for engines like AWS Athena where it's not required.",
|
|
364
399
|
)
|
|
365
400
|
|
|
401
|
+
dbt_is_primary_sibling: bool = Field(
|
|
402
|
+
default=True,
|
|
403
|
+
description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
|
|
404
|
+
"When True (default), dbt entities are primary and target platform entities are secondary. "
|
|
405
|
+
"When False, target platform entities are primary and dbt entities are secondary. "
|
|
406
|
+
"Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
drop_duplicate_sources: bool = Field(
|
|
410
|
+
default=True,
|
|
411
|
+
description="When enabled, drops sources that have the same name in the target platform as a model. "
|
|
412
|
+
"This ensures that lineage is generated reliably, but will lose any documentation associated only with the source.",
|
|
413
|
+
)
|
|
414
|
+
|
|
366
415
|
@validator("target_platform")
|
|
367
416
|
def validate_target_platform_value(cls, target_platform: str) -> str:
|
|
368
417
|
if target_platform.lower() == DBT_PLATFORM:
|
|
@@ -502,7 +551,7 @@ class DBTNode:
|
|
|
502
551
|
raw_code: Optional[str]
|
|
503
552
|
|
|
504
553
|
dbt_adapter: str
|
|
505
|
-
dbt_name: str
|
|
554
|
+
dbt_name: str # dbt unique identifier
|
|
506
555
|
dbt_file_path: Optional[str]
|
|
507
556
|
dbt_package_name: Optional[str] # this is pretty much always present
|
|
508
557
|
|
|
@@ -618,14 +667,8 @@ class DBTNode:
|
|
|
618
667
|
def exists_in_target_platform(self):
|
|
619
668
|
return not (self.is_ephemeral_model() or self.node_type == "test")
|
|
620
669
|
|
|
621
|
-
def
|
|
622
|
-
"""
|
|
623
|
-
Update the column list if they are not already set.
|
|
624
|
-
"""
|
|
625
|
-
|
|
626
|
-
if self.columns:
|
|
627
|
-
# If we already have columns, don't overwrite them.
|
|
628
|
-
return
|
|
670
|
+
def set_columns(self, schema_fields: List[SchemaField]) -> None:
|
|
671
|
+
"""Update the column list."""
|
|
629
672
|
|
|
630
673
|
self.columns = [
|
|
631
674
|
DBTColumn(
|
|
@@ -822,18 +865,22 @@ def get_column_type(
|
|
|
822
865
|
@platform_name("dbt")
|
|
823
866
|
@config_class(DBTCommonConfig)
|
|
824
867
|
@support_status(SupportStatus.CERTIFIED)
|
|
825
|
-
@capability(
|
|
868
|
+
@capability(
|
|
869
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
870
|
+
)
|
|
826
871
|
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
|
|
827
872
|
@capability(
|
|
828
873
|
SourceCapability.LINEAGE_FINE,
|
|
829
874
|
"Enabled by default, configure using `include_column_lineage`",
|
|
830
875
|
)
|
|
831
876
|
class DBTSourceBase(StatefulIngestionSourceBase):
|
|
832
|
-
def __init__(self, config: DBTCommonConfig, ctx: PipelineContext
|
|
877
|
+
def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
|
|
833
878
|
super().__init__(config, ctx)
|
|
879
|
+
self.platform: str = "dbt"
|
|
880
|
+
|
|
834
881
|
self.config = config
|
|
835
|
-
self.platform: str = platform
|
|
836
882
|
self.report: DBTSourceReport = DBTSourceReport()
|
|
883
|
+
|
|
837
884
|
self.compiled_owner_extraction_pattern: Optional[Any] = None
|
|
838
885
|
if self.config.owner_extraction_pattern:
|
|
839
886
|
self.compiled_owner_extraction_pattern = re.compile(
|
|
@@ -849,7 +896,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
849
896
|
test_nodes: List[DBTNode],
|
|
850
897
|
extra_custom_props: Dict[str, str],
|
|
851
898
|
all_nodes_map: Dict[str, DBTNode],
|
|
852
|
-
) -> Iterable[
|
|
899
|
+
) -> Iterable[MetadataChangeProposalWrapper]:
|
|
853
900
|
for node in sorted(test_nodes, key=lambda n: n.dbt_name):
|
|
854
901
|
upstreams = get_upstreams_for_test(
|
|
855
902
|
test_node=node,
|
|
@@ -902,7 +949,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
902
949
|
yield MetadataChangeProposalWrapper(
|
|
903
950
|
entityUrn=assertion_urn,
|
|
904
951
|
aspect=self._make_data_platform_instance_aspect(),
|
|
905
|
-
)
|
|
952
|
+
)
|
|
906
953
|
|
|
907
954
|
yield make_assertion_from_test(
|
|
908
955
|
custom_props,
|
|
@@ -949,7 +996,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
949
996
|
),
|
|
950
997
|
)
|
|
951
998
|
|
|
952
|
-
def get_workunits_internal(
|
|
999
|
+
def get_workunits_internal(
|
|
1000
|
+
self,
|
|
1001
|
+
) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
|
|
953
1002
|
if self.config.write_semantics == "PATCH":
|
|
954
1003
|
self.ctx.require_graph("Using dbt with write_semantics=PATCH")
|
|
955
1004
|
|
|
@@ -968,6 +1017,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
968
1017
|
self._infer_schemas_and_update_cll(all_nodes_map)
|
|
969
1018
|
|
|
970
1019
|
nodes = self._filter_nodes(all_nodes)
|
|
1020
|
+
nodes = self._drop_duplicate_sources(nodes)
|
|
1021
|
+
|
|
971
1022
|
non_test_nodes = [
|
|
972
1023
|
dataset_node for dataset_node in nodes if dataset_node.node_type != "test"
|
|
973
1024
|
]
|
|
@@ -989,15 +1040,53 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
989
1040
|
all_nodes_map,
|
|
990
1041
|
)
|
|
991
1042
|
|
|
992
|
-
def _is_allowed_node(self,
|
|
993
|
-
|
|
1043
|
+
def _is_allowed_node(self, node: DBTNode) -> bool:
|
|
1044
|
+
"""
|
|
1045
|
+
Check whether a node should be processed, using multi-layer rules. Checks for materialized nodes might need to be restricted in the future to some cases
|
|
1046
|
+
"""
|
|
1047
|
+
if not self.config.node_name_pattern.allowed(node.dbt_name):
|
|
1048
|
+
return False
|
|
1049
|
+
|
|
1050
|
+
if not self._is_allowed_materialized_node(node):
|
|
1051
|
+
return False
|
|
1052
|
+
|
|
1053
|
+
return True
|
|
1054
|
+
|
|
1055
|
+
def _is_allowed_materialized_node(self, node: DBTNode) -> bool:
|
|
1056
|
+
"""Filter nodes based on their materialized database location for catalog consistency"""
|
|
1057
|
+
|
|
1058
|
+
# Database level filtering
|
|
1059
|
+
if not node.database:
|
|
1060
|
+
return True
|
|
1061
|
+
if not self.config.materialized_node_pattern.database_pattern.allowed(
|
|
1062
|
+
node.database
|
|
1063
|
+
):
|
|
1064
|
+
return False
|
|
1065
|
+
|
|
1066
|
+
# Schema level filtering: {database}.{schema}
|
|
1067
|
+
if not node.schema:
|
|
1068
|
+
return True
|
|
1069
|
+
if not self.config.materialized_node_pattern.schema_pattern.allowed(
|
|
1070
|
+
node._join_parts([node.database, node.schema])
|
|
1071
|
+
):
|
|
1072
|
+
return False
|
|
1073
|
+
|
|
1074
|
+
# Table level filtering: {database}.{schema}.{table}
|
|
1075
|
+
if not node.name:
|
|
1076
|
+
return True
|
|
1077
|
+
if not self.config.materialized_node_pattern.table_pattern.allowed(
|
|
1078
|
+
node.get_db_fqn()
|
|
1079
|
+
):
|
|
1080
|
+
return False
|
|
1081
|
+
|
|
1082
|
+
return True
|
|
994
1083
|
|
|
995
1084
|
def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]:
|
|
996
|
-
nodes = []
|
|
1085
|
+
nodes: List[DBTNode] = []
|
|
997
1086
|
for node in all_nodes:
|
|
998
1087
|
key = node.dbt_name
|
|
999
1088
|
|
|
1000
|
-
if not self._is_allowed_node(
|
|
1089
|
+
if not self._is_allowed_node(node):
|
|
1001
1090
|
self.report.nodes_filtered.append(key)
|
|
1002
1091
|
continue
|
|
1003
1092
|
|
|
@@ -1005,6 +1094,62 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1005
1094
|
|
|
1006
1095
|
return nodes
|
|
1007
1096
|
|
|
1097
|
+
def _drop_duplicate_sources(self, original_nodes: List[DBTNode]) -> List[DBTNode]:
|
|
1098
|
+
"""Detect and correct cases where a model and source have the same name.
|
|
1099
|
+
|
|
1100
|
+
In these cases, we don't want to generate both because they'll have the same
|
|
1101
|
+
urn and hence overwrite each other. Instead, we drop the source and update
|
|
1102
|
+
references to it to point at the model.
|
|
1103
|
+
|
|
1104
|
+
The risk here is that the source might have documentation that'd be lost,
|
|
1105
|
+
which is why we maintain optionality with a config flag.
|
|
1106
|
+
"""
|
|
1107
|
+
if not self.config.drop_duplicate_sources:
|
|
1108
|
+
return original_nodes
|
|
1109
|
+
|
|
1110
|
+
self.report.duplicate_sources_dropped = 0
|
|
1111
|
+
self.report.duplicate_sources_references_updated = 0
|
|
1112
|
+
|
|
1113
|
+
# Pass 1 - find all model names in the warehouse.
|
|
1114
|
+
warehouse_model_names: Dict[str, str] = {} # warehouse name -> model unique id
|
|
1115
|
+
for node in original_nodes:
|
|
1116
|
+
if node.node_type == "model" and node.exists_in_target_platform:
|
|
1117
|
+
warehouse_model_names[node.get_db_fqn()] = node.dbt_name
|
|
1118
|
+
|
|
1119
|
+
# Pass 2 - identify + drop duplicate sources.
|
|
1120
|
+
source_references_to_update: Dict[
|
|
1121
|
+
str, str
|
|
1122
|
+
] = {} # source unique id -> model unique id
|
|
1123
|
+
nodes: List[DBTNode] = []
|
|
1124
|
+
for node in original_nodes:
|
|
1125
|
+
if (
|
|
1126
|
+
node.node_type == "source"
|
|
1127
|
+
and node.exists_in_target_platform
|
|
1128
|
+
and (model_name := warehouse_model_names.get(node.get_db_fqn()))
|
|
1129
|
+
):
|
|
1130
|
+
self.report.warning(
|
|
1131
|
+
title="Duplicate model and source names detected",
|
|
1132
|
+
message="We found a dbt model and dbt source with the same name. To ensure reliable lineage generation, the source node was ignored. "
|
|
1133
|
+
"If you associated documentation/tags/other metadata with the source, it will be lost. "
|
|
1134
|
+
"To avoid this, you should remove the source node from your dbt project and replace any `source(<source_name>)` calls with `ref(<model_name>)`.",
|
|
1135
|
+
context=f"{node.dbt_name} (called {node.get_db_fqn()} in {self.config.target_platform}) duplicates {model_name}",
|
|
1136
|
+
)
|
|
1137
|
+
self.report.duplicate_sources_dropped += 1
|
|
1138
|
+
source_references_to_update[node.dbt_name] = model_name
|
|
1139
|
+
else:
|
|
1140
|
+
nodes.append(node)
|
|
1141
|
+
|
|
1142
|
+
# Pass 3 - update references to the dropped sources.
|
|
1143
|
+
for node in nodes:
|
|
1144
|
+
for i, current_upstream in enumerate(node.upstream_nodes):
|
|
1145
|
+
if current_upstream in source_references_to_update:
|
|
1146
|
+
node.upstream_nodes[i] = source_references_to_update[
|
|
1147
|
+
current_upstream
|
|
1148
|
+
]
|
|
1149
|
+
self.report.duplicate_sources_references_updated += 1
|
|
1150
|
+
|
|
1151
|
+
return nodes
|
|
1152
|
+
|
|
1008
1153
|
@staticmethod
|
|
1009
1154
|
def _to_schema_info(schema_fields: List[SchemaField]) -> SchemaInfo:
|
|
1010
1155
|
return {column.fieldPath: column.nativeDataType for column in schema_fields}
|
|
@@ -1033,8 +1178,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1033
1178
|
cll_nodes.add(dbt_name)
|
|
1034
1179
|
schema_nodes.add(dbt_name)
|
|
1035
1180
|
|
|
1036
|
-
for dbt_name in all_nodes_map:
|
|
1037
|
-
if self._is_allowed_node(
|
|
1181
|
+
for dbt_name, dbt_node in all_nodes_map.items():
|
|
1182
|
+
if self._is_allowed_node(dbt_node):
|
|
1038
1183
|
add_node_to_cll_list(dbt_name)
|
|
1039
1184
|
|
|
1040
1185
|
return schema_nodes, cll_nodes
|
|
@@ -1175,6 +1320,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1175
1320
|
logger.debug(
|
|
1176
1321
|
f"Not generating CLL for {node.dbt_name} because we don't need it."
|
|
1177
1322
|
)
|
|
1323
|
+
elif node.language != "sql":
|
|
1324
|
+
logger.debug(
|
|
1325
|
+
f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
|
|
1326
|
+
)
|
|
1327
|
+
self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
|
|
1178
1328
|
elif node.compiled_code:
|
|
1179
1329
|
# Add CTE stops based on the upstreams list.
|
|
1180
1330
|
cte_mapping = {
|
|
@@ -1238,9 +1388,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1238
1388
|
target_node_urn, self._to_schema_info(inferred_schema_fields)
|
|
1239
1389
|
)
|
|
1240
1390
|
|
|
1241
|
-
#
|
|
1242
|
-
|
|
1243
|
-
|
|
1391
|
+
# When updating the node's columns, our order of preference is:
|
|
1392
|
+
# 1. Schema from the dbt catalog
|
|
1393
|
+
# 2. Inferred schema
|
|
1394
|
+
# 3. Schema fetched from the graph
|
|
1395
|
+
if node.columns:
|
|
1396
|
+
self.report.nodes_with_catalog_columns += 1
|
|
1397
|
+
pass # we already have columns from the dbt catalog
|
|
1398
|
+
elif inferred_schema_fields:
|
|
1399
|
+
logger.debug(
|
|
1400
|
+
f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
|
|
1401
|
+
)
|
|
1402
|
+
self.report.nodes_with_inferred_columns += 1
|
|
1403
|
+
node.set_columns(inferred_schema_fields)
|
|
1404
|
+
elif schema_fields:
|
|
1405
|
+
logger.debug(
|
|
1406
|
+
f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
|
|
1407
|
+
)
|
|
1408
|
+
self.report.nodes_with_graph_columns += 1
|
|
1409
|
+
node.set_columns(schema_fields)
|
|
1410
|
+
else:
|
|
1411
|
+
logger.debug(f"No columns found for {node.dbt_name}")
|
|
1412
|
+
self.report.nodes_with_no_columns += 1
|
|
1244
1413
|
|
|
1245
1414
|
def _parse_cll(
|
|
1246
1415
|
self,
|
|
@@ -1305,6 +1474,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1305
1474
|
self.config.tag_prefix,
|
|
1306
1475
|
"SOURCE_CONTROL",
|
|
1307
1476
|
self.config.strip_user_ids_from_email,
|
|
1477
|
+
match_nested_props=True,
|
|
1308
1478
|
)
|
|
1309
1479
|
|
|
1310
1480
|
action_processor_tag = OperationProcessor(
|
|
@@ -1376,6 +1546,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1376
1546
|
dataset_snapshot = DatasetSnapshot(
|
|
1377
1547
|
urn=node_datahub_urn, aspects=list(snapshot_aspects)
|
|
1378
1548
|
)
|
|
1549
|
+
# Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
|
|
1550
|
+
if self._should_create_sibling_relationships(node):
|
|
1551
|
+
# Get the target platform URN
|
|
1552
|
+
target_platform_urn = node.get_urn(
|
|
1553
|
+
self.config.target_platform,
|
|
1554
|
+
self.config.env,
|
|
1555
|
+
self.config.target_platform_instance,
|
|
1556
|
+
)
|
|
1557
|
+
|
|
1558
|
+
yield MetadataChangeProposalWrapper(
|
|
1559
|
+
entityUrn=node_datahub_urn,
|
|
1560
|
+
aspect=SiblingsClass(
|
|
1561
|
+
siblings=[target_platform_urn],
|
|
1562
|
+
primary=self.config.dbt_is_primary_sibling,
|
|
1563
|
+
),
|
|
1564
|
+
).as_workunit()
|
|
1565
|
+
|
|
1379
1566
|
mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
|
|
1380
1567
|
if self.config.write_semantics == "PATCH":
|
|
1381
1568
|
mce = self.get_patched_mce(mce)
|
|
@@ -1479,6 +1666,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1479
1666
|
if not node.exists_in_target_platform:
|
|
1480
1667
|
continue
|
|
1481
1668
|
|
|
1669
|
+
# Emit sibling patch for target platform entity BEFORE any other aspects.
|
|
1670
|
+
# This ensures the hook can detect explicit primary settings when processing later aspects.
|
|
1671
|
+
if self._should_create_sibling_relationships(node):
|
|
1672
|
+
# Get the dbt platform URN
|
|
1673
|
+
dbt_platform_urn = node.get_urn(
|
|
1674
|
+
DBT_PLATFORM,
|
|
1675
|
+
self.config.env,
|
|
1676
|
+
self.config.platform_instance,
|
|
1677
|
+
)
|
|
1678
|
+
|
|
1679
|
+
# Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
|
|
1680
|
+
target_patch = DatasetPatchBuilder(node_datahub_urn)
|
|
1681
|
+
target_patch.add_sibling(
|
|
1682
|
+
dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
|
|
1683
|
+
)
|
|
1684
|
+
|
|
1685
|
+
yield from auto_workunit(
|
|
1686
|
+
MetadataWorkUnit(
|
|
1687
|
+
id=MetadataWorkUnit.generate_workunit_id(mcp),
|
|
1688
|
+
mcp_raw=mcp,
|
|
1689
|
+
is_primary_source=False, # Not authoritative over warehouse metadata
|
|
1690
|
+
)
|
|
1691
|
+
for mcp in target_patch.build()
|
|
1692
|
+
)
|
|
1693
|
+
|
|
1482
1694
|
# This code block is run when we are generating entities of platform type.
|
|
1483
1695
|
# We will not link the platform not to the dbt node for type "source" because
|
|
1484
1696
|
# in this case the platform table existed first.
|
|
@@ -1585,6 +1797,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1585
1797
|
def get_external_url(self, node: DBTNode) -> Optional[str]:
|
|
1586
1798
|
pass
|
|
1587
1799
|
|
|
1800
|
+
@staticmethod
|
|
1801
|
+
def _truncate_code(code: str, max_length: int) -> str:
|
|
1802
|
+
if len(code) > max_length:
|
|
1803
|
+
return code[:max_length] + "..."
|
|
1804
|
+
return code
|
|
1805
|
+
|
|
1588
1806
|
def _create_view_properties_aspect(
|
|
1589
1807
|
self, node: DBTNode
|
|
1590
1808
|
) -> Optional[ViewPropertiesClass]:
|
|
@@ -1596,6 +1814,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1596
1814
|
compiled_code = try_format_query(
|
|
1597
1815
|
node.compiled_code, platform=self.config.target_platform
|
|
1598
1816
|
)
|
|
1817
|
+
compiled_code = self._truncate_code(
|
|
1818
|
+
compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
|
|
1819
|
+
)
|
|
1599
1820
|
|
|
1600
1821
|
materialized = node.materialization in {"table", "incremental", "snapshot"}
|
|
1601
1822
|
view_properties = ViewPropertiesClass(
|
|
@@ -1676,6 +1897,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1676
1897
|
self.config.tag_prefix,
|
|
1677
1898
|
"SOURCE_CONTROL",
|
|
1678
1899
|
self.config.strip_user_ids_from_email,
|
|
1900
|
+
match_nested_props=True,
|
|
1679
1901
|
)
|
|
1680
1902
|
|
|
1681
1903
|
canonical_schema: List[SchemaField] = []
|
|
@@ -2024,5 +2246,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
2024
2246
|
term_id_set.add(existing_term.urn)
|
|
2025
2247
|
return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
|
|
2026
2248
|
|
|
2249
|
+
def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
|
|
2250
|
+
"""
|
|
2251
|
+
Determines whether to emit sibling relationships for a dbt node.
|
|
2252
|
+
|
|
2253
|
+
Sibling relationships (both dbt entity's aspect and target entity's patch) are only
|
|
2254
|
+
emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
|
|
2255
|
+
relationships. When dbt_is_primary_sibling=True,
|
|
2256
|
+
the SiblingAssociationHook handles sibling creation automatically.
|
|
2257
|
+
|
|
2258
|
+
Args:
|
|
2259
|
+
node: The dbt node to evaluate
|
|
2260
|
+
|
|
2261
|
+
Returns:
|
|
2262
|
+
True if sibling patches should be emitted for this node
|
|
2263
|
+
"""
|
|
2264
|
+
# Only create siblings for entities that exist in target platform
|
|
2265
|
+
if not node.exists_in_target_platform:
|
|
2266
|
+
return False
|
|
2267
|
+
|
|
2268
|
+
# Only emit patches when explicit primary/secondary control is needed
|
|
2269
|
+
return self.config.dbt_is_primary_sibling is False
|
|
2270
|
+
|
|
2027
2271
|
def get_report(self):
|
|
2028
2272
|
return self.report
|