acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,943 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import difflib
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import (
|
|
7
|
+
TYPE_CHECKING,
|
|
8
|
+
Any,
|
|
9
|
+
Callable,
|
|
10
|
+
Dict,
|
|
11
|
+
List,
|
|
12
|
+
Literal,
|
|
13
|
+
Optional,
|
|
14
|
+
Set,
|
|
15
|
+
Union,
|
|
16
|
+
overload,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from typing_extensions import assert_never, deprecated
|
|
20
|
+
|
|
21
|
+
import datahub.metadata.schema_classes as models
|
|
22
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
23
|
+
from datahub.errors import SdkUsageError
|
|
24
|
+
from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn, SchemaFieldUrn, Urn
|
|
25
|
+
from datahub.sdk._shared import (
|
|
26
|
+
ChartUrnOrStr,
|
|
27
|
+
DashboardUrnOrStr,
|
|
28
|
+
DatajobUrnOrStr,
|
|
29
|
+
DatasetUrnOrStr,
|
|
30
|
+
)
|
|
31
|
+
from datahub.sdk._utils import DEFAULT_ACTOR_URN
|
|
32
|
+
from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
|
|
33
|
+
from datahub.sdk.search_client import compile_filters
|
|
34
|
+
from datahub.sdk.search_filters import Filter, FilterDsl
|
|
35
|
+
from datahub.specific.chart import ChartPatchBuilder
|
|
36
|
+
from datahub.specific.dashboard import DashboardPatchBuilder
|
|
37
|
+
from datahub.specific.datajob import DataJobPatchBuilder
|
|
38
|
+
from datahub.specific.dataset import DatasetPatchBuilder
|
|
39
|
+
from datahub.sql_parsing.fingerprint_utils import generate_hash
|
|
40
|
+
from datahub.utilities.ordered_set import OrderedSet
|
|
41
|
+
from datahub.utilities.urns.error import InvalidUrnError
|
|
42
|
+
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from datahub.sdk.main_client import DataHubClient
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Sentinel audit stamp: epoch time (0) and the SDK's default actor URN.
# Used where an AuditStampClass is structurally required but no meaningful
# timestamp/actor is available.
_empty_audit_stamp = models.AuditStampClass(
    time=0,
    actor=DEFAULT_ACTOR_URN,
)


logger = logging.getLogger(__name__)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class LineagePath:
    """One element along a lineage path between two entities."""

    # URN of the entity (or schema field's parent entity) at this hop.
    urn: str
    # NOTE(review): presumably the entity's type/name as reported by the
    # lineage API — confirm against the code that populates this.
    entity_name: str
    # Set when this path element refers to a specific column; None for
    # whole-entity hops.
    column_name: Optional[str] = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class LineageResult:
    """A single entity returned from a lineage traversal."""

    # URN of the related entity.
    urn: str
    # Entity type (e.g. "dataset").
    type: str
    # Number of hops from the starting entity to this one.
    hops: int
    # Whether this entity was found upstream or downstream of the start.
    direction: Literal["upstream", "downstream"]
    platform: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    # Concrete paths connecting the start entity to this result, when the
    # caller requested path details; otherwise None.
    paths: Optional[List[LineagePath]] = None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class LineageClient:
|
|
76
|
+
    def __init__(self, client: DataHubClient):
        """Create a lineage client bound to *client*.

        Caches the client's underlying graph connection for aspect reads.
        """
        self._client = client
        # Shortcut to the client's graph connection.
        self._graph = client._graph
|
|
79
|
+
|
|
80
|
+
def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
|
|
81
|
+
schema_metadata = self._client._graph.get_aspect(
|
|
82
|
+
str(dataset_urn), models.SchemaMetadataClass
|
|
83
|
+
)
|
|
84
|
+
if schema_metadata is None:
|
|
85
|
+
return set()
|
|
86
|
+
|
|
87
|
+
return {field.fieldPath for field in schema_metadata.fields}
|
|
88
|
+
|
|
89
|
+
@classmethod
|
|
90
|
+
def _get_strict_column_lineage(
|
|
91
|
+
cls,
|
|
92
|
+
upstream_fields: Set[str],
|
|
93
|
+
downstream_fields: Set[str],
|
|
94
|
+
) -> ColumnLineageMapping:
|
|
95
|
+
"""Find matches between upstream and downstream fields with case-insensitive matching."""
|
|
96
|
+
strict_column_lineage: ColumnLineageMapping = {}
|
|
97
|
+
|
|
98
|
+
# Create case-insensitive mapping of upstream fields
|
|
99
|
+
case_insensitive_map = {field.lower(): field for field in upstream_fields}
|
|
100
|
+
|
|
101
|
+
# Match downstream fields using case-insensitive comparison
|
|
102
|
+
for downstream_field in downstream_fields:
|
|
103
|
+
lower_field = downstream_field.lower()
|
|
104
|
+
if lower_field in case_insensitive_map:
|
|
105
|
+
# Use the original case of the upstream field
|
|
106
|
+
strict_column_lineage[downstream_field] = [
|
|
107
|
+
case_insensitive_map[lower_field]
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
return strict_column_lineage
|
|
111
|
+
|
|
112
|
+
@classmethod
def _get_fuzzy_column_lineage(
    cls,
    upstream_fields: Set[str],
    downstream_fields: Set[str],
) -> ColumnLineageMapping:
    """Generate fuzzy matches between upstream and downstream fields.

    Matching is attempted in three stages per downstream field:
    exact name, normalized name (case/underscore-insensitive), then the
    single closest similarity match above a 0.8 cutoff.
    """

    def _canonical(name: str) -> str:
        # Lowercase and drop underscores so e.g. "User_Id" matches "userid".
        return name.lower().replace("_", "")

    # Normalized lookup that preserves the upstream field's original spelling.
    canonical_upstream = {_canonical(field): field for field in upstream_fields}

    matched: ColumnLineageMapping = {}
    for downstream_field in downstream_fields:
        # Stage 1: exact name match.
        if downstream_field in upstream_fields:
            matched[downstream_field] = [downstream_field]
            continue

        # Stage 2: match after normalization.
        canonical = _canonical(downstream_field)
        upstream_match = canonical_upstream.get(canonical)

        if upstream_match is None:
            # Stage 3: closest similarity match (best single candidate).
            candidates = difflib.get_close_matches(
                canonical,
                canonical_upstream.keys(),
                n=1,  # Return only the best match
                cutoff=0.8,  # Adjust cutoff for sensitivity
            )
            if candidates:
                upstream_match = canonical_upstream[candidates[0]]

        if upstream_match is not None:
            matched[downstream_field] = [upstream_match]

    return matched
|
|
156
|
+
|
|
157
|
+
@overload
def add_lineage(
    self,
    *,
    upstream: DatasetUrnOrStr,
    downstream: DatasetUrnOrStr,
    column_lineage: Union[
        bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
    ] = False,
    transformation_text: Optional[str] = None,
) -> None:
    """Add dataset-to-dataset lineage with column-level mapping."""

@overload
def add_lineage(
    self,
    *,
    upstream: Union[DatajobUrnOrStr],
    downstream: DatasetUrnOrStr,
) -> None:
    """Add datajob-to-dataset lineage (the datajob produces the dataset)."""

@overload
def add_lineage(
    self,
    *,
    upstream: Union[DatasetUrnOrStr, DatajobUrnOrStr],
    downstream: DatajobUrnOrStr,
) -> None:
    """Add dataset-to-datajob or datajob-to-datajob lineage."""

@overload
def add_lineage(
    self,
    *,
    upstream: Union[DashboardUrnOrStr, DatasetUrnOrStr, ChartUrnOrStr],
    downstream: DashboardUrnOrStr,
) -> None:
    """Add dataset-to-dashboard, chart-to-dashboard, or dashboard-to-dashboard lineage."""

@overload
def add_lineage(
    self,
    *,
    upstream: DatasetUrnOrStr,
    downstream: ChartUrnOrStr,
) -> None:
    """Add dataset-to-chart lineage."""
|
|
205
|
+
|
|
206
|
+
# The actual implementation that handles all overloaded cases
def add_lineage(
    self,
    *,
    upstream: Union[
        DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
    ],
    downstream: Union[
        DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
    ],
    column_lineage: Union[
        bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
    ] = False,
    transformation_text: Optional[str] = None,
) -> None:
    """Add lineage between two entities.

    This flexible method handles different combinations of entity types:
    - dataset to dataset
    - dataset to datajob
    - datajob to dataset
    - datajob to datajob
    - dashboard to dataset
    - dashboard to chart
    - dashboard to dashboard
    - dataset to chart

    Args:
        upstream: URN of the upstream entity (dataset or datajob)
        downstream: URN of the downstream entity (dataset or datajob)
        column_lineage: Optional boolean to indicate if column-level lineage should be added or a lineage mapping type (auto_fuzzy, auto_strict, or a mapping of column-level lineage)
        transformation_text: Optional SQL query text that defines the transformation
            (only applicable for dataset-to-dataset lineage)

    Raises:
        InvalidUrnError: If the URNs provided are invalid
        SdkUsageError: If certain parameter combinations are not supported
    """
    # Validate parameter combinations
    upstream_entity_type = Urn.from_string(upstream).entity_type
    downstream_entity_type = Urn.from_string(downstream).entity_type

    # Dispatch key: the (upstream, downstream) entity-type pair.
    key = (upstream_entity_type, downstream_entity_type)

    # if it's not dataset-dataset lineage but provided with column_lineage or transformation_text, raise an error
    if key != ("dataset", "dataset") and (column_lineage or transformation_text):
        raise SdkUsageError(
            "Column lineage and query text are only applicable for dataset-to-dataset lineage"
        )

    lineage_handlers: dict[tuple[str, str], Callable] = {
        ("dataset", "dataset"): self._add_dataset_lineage,
        ("dataset", "dashboard"): self._add_dashboard_lineage,
        ("chart", "dashboard"): self._add_dashboard_lineage,
        ("dashboard", "dashboard"): self._add_dashboard_lineage,
        ("dataset", "dataJob"): self._add_datajob_lineage,
        ("dataJob", "dataJob"): self._add_datajob_lineage,
        ("dataJob", "dataset"): self._add_datajob_output,
        ("dataset", "chart"): self._add_chart_lineage,
    }

    # BUG FIX: keep the try body to the dict lookup only. Previously the
    # handler call was inside the try, so a KeyError raised *within* a
    # handler was silently rewritten into "Unsupported entity type
    # combination", hiding the real error.
    try:
        lineage_handler = lineage_handlers[key]
    except KeyError:
        raise SdkUsageError(
            f"Unsupported entity type combination: {upstream_entity_type} -> {downstream_entity_type}"
        ) from None

    # All handlers accept the same keyword set and ignore what they do not
    # need (via **_ in their signatures).
    lineage_handler(
        upstream=upstream,
        downstream=downstream,
        upstream_type=upstream_entity_type,
        column_lineage=column_lineage,
        transformation_text=transformation_text,
    )
|
|
280
|
+
|
|
281
|
+
def _add_dataset_lineage(
    self,
    *,
    upstream,
    downstream,
    column_lineage,
    transformation_text,
    **_,
):
    """Handle the dataset -> dataset case of add_lineage.

    With transformation_text, lineage is recorded as TRANSFORMED and backed
    by a query entity; otherwise a plain COPY edge is patched onto the
    downstream dataset.
    """
    upstream_urn = DatasetUrn.from_string(upstream)
    downstream_urn = DatasetUrn.from_string(downstream)

    if column_lineage:
        column_lineage = (
            "auto_fuzzy" if column_lineage is True else column_lineage
        )  # if column_lineage is True, set it to auto_fuzzy
        cll = self._process_column_lineage(
            column_lineage, upstream_urn, downstream_urn
        )
    else:
        cll = None

    if transformation_text:
        # Transformation path: emits the query entity and the lineage patch
        # together in a single emit_mcps call.
        self._process_transformation_lineage(
            transformation_text, upstream_urn, downstream_urn, cll
        )
    else:
        # Copy path: patch a COPY upstream edge (plus any fine-grained
        # column lineage) onto the downstream dataset.
        updater = DatasetPatchBuilder(str(downstream_urn))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream_urn),
                type=models.DatasetLineageTypeClass.COPY,
            )
        )
        for cl in cll or []:
            updater.add_fine_grained_upstream_lineage(cl)
        self._client.entities.update(updater)
|
|
318
|
+
|
|
319
|
+
def _add_dashboard_lineage(self, *, upstream, downstream, upstream_type, **_):
    """Attach an upstream dataset, chart, or dashboard to a dashboard."""
    patch = DashboardPatchBuilder(str(downstream))

    # Dispatch table: upstream entity type -> patch method adding that edge.
    edge_adders = {
        "dataset": patch.add_dataset_edge,
        "chart": patch.add_chart_edge,
        "dashboard": patch.add_dashboard,
    }

    add_edge = edge_adders.get(upstream_type)
    if add_edge is None:
        raise SdkUsageError(
            f"Unsupported entity type combination: {upstream_type} -> dashboard"
        )

    add_edge(upstream)
    self._client.entities.update(patch)
|
|
332
|
+
|
|
333
|
+
def _add_datajob_lineage(self, *, upstream, downstream, upstream_type, **_):
    """Attach an upstream dataset or datajob as an input of a datajob."""
    patch = DataJobPatchBuilder(str(downstream))

    # Dispatch table: upstream entity type -> patch method adding the input.
    input_adders = {
        "dataset": patch.add_input_dataset,
        "dataJob": patch.add_input_datajob,
    }

    add_input = input_adders.get(upstream_type)
    if add_input is None:
        raise SdkUsageError(
            f"Unsupported entity type combination: {upstream_type} -> dataJob"
        )

    add_input(upstream)
    self._client.entities.update(patch)
|
|
344
|
+
|
|
345
|
+
def _add_datajob_output(self, *, upstream, downstream, **_):
    """Record a dataset as an output of a datajob (dataJob -> dataset)."""
    # Note: the patch targets the *upstream* datajob here; the downstream
    # dataset is registered as one of its outputs.
    builder = DataJobPatchBuilder(str(upstream))
    builder.add_output_dataset(downstream)
    self._client.entities.update(builder)
|
|
349
|
+
|
|
350
|
+
def _add_chart_lineage(self, *, upstream, downstream, **_):
    """Record a dataset as an input of a chart (dataset -> chart)."""
    builder = ChartPatchBuilder(str(downstream))
    builder.add_input_edge(upstream)
    self._client.entities.update(builder)
|
|
354
|
+
|
|
355
|
+
def _process_column_lineage(self, column_lineage, upstream_urn, downstream_urn):
    """Resolve a column_lineage argument into fine-grained lineage objects.

    Args:
        column_lineage: "auto_fuzzy" / "auto_strict" to derive the mapping
            from both datasets' schemas, or an explicit mapping dict of
            downstream column -> upstream columns.
        upstream_urn: Parsed upstream DatasetUrn.
        downstream_urn: Parsed downstream DatasetUrn.

    Returns:
        The parsed fine-grained lineage, or None when column_lineage is falsy.
    """
    cll = None
    if column_lineage:
        # Auto column lineage generation
        if column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
            upstream_schema = self._get_fields_from_dataset_urn(upstream_urn)
            downstream_schema = self._get_fields_from_dataset_urn(downstream_urn)

            # Choose matching strategy
            mapping = (
                self._get_fuzzy_column_lineage(upstream_schema, downstream_schema)
                if column_lineage == "auto_fuzzy"
                else self._get_strict_column_lineage(
                    upstream_schema, downstream_schema
                )
            )
            cll = parse_cll_mapping(
                upstream=upstream_urn,
                downstream=downstream_urn,
                cll_mapping=mapping,
            )
        # Explicit column lineage
        elif isinstance(column_lineage, dict):
            cll = parse_cll_mapping(
                upstream=upstream_urn,
                downstream=downstream_urn,
                cll_mapping=column_lineage,
            )
        else:
            # Exhaustiveness check: any other value is a caller bug.
            assert_never(column_lineage)
    return cll
|
|
386
|
+
|
|
387
|
+
def _process_transformation_lineage(
    self, transformation_text, upstream_urn, downstream_urn, cll
):
    """Record TRANSFORMED lineage backed by a query entity.

    Emits, in one emit_mcps call: a Query entity holding
    transformation_text, and a patch on the downstream dataset whose
    upstream edge (and any fine-grained lineage in cll) references that
    query.

    Raises:
        SdkUsageError: If the downstream dataset does not exist.
    """
    # Collect every dataset and schema-field URN touched, for the query's
    # subjects aspect.
    fields_involved = OrderedSet([str(upstream_urn), str(downstream_urn)])
    if cll is not None:
        for c in cll:
            for field in c.upstreams or []:
                fields_involved.add(field)
            for field in c.downstreams or []:
                fields_involved.add(field)

    # Create query URN and entity
    # The query URN is derived from a hash of the text, so the same
    # transformation always maps to the same query entity.
    query_urn = QueryUrn(generate_hash(transformation_text)).urn()
    # NOTE(review): imported lazily, presumably to avoid an import cycle —
    # confirm before hoisting to module level.
    from datahub.sql_parsing.sql_parsing_aggregator import (
        make_query_subjects,
    )

    query_entity = MetadataChangeProposalWrapper.construct_many(
        query_urn,
        aspects=[
            models.QueryPropertiesClass(
                statement=models.QueryStatementClass(
                    value=transformation_text,
                    language=models.QueryLanguageClass.SQL,
                ),
                source=models.QuerySourceClass.SYSTEM,
                created=_empty_audit_stamp,
                lastModified=_empty_audit_stamp,
            ),
            make_query_subjects(list(fields_involved)),
        ],
    )

    # Build dataset update
    updater = DatasetPatchBuilder(str(downstream_urn))
    updater.add_upstream_lineage(
        models.UpstreamClass(
            dataset=str(upstream_urn),
            type=models.DatasetLineageTypeClass.TRANSFORMED,
            query=query_urn,
        )
    )

    # Add fine-grained lineage
    for cl in cll or []:
        cl.query = query_urn
        updater.add_fine_grained_upstream_lineage(cl)

    # Check dataset existence
    if not self._client._graph.exists(updater.urn):
        raise SdkUsageError(
            f"Dataset {updater.urn} does not exist, and hence cannot be updated."
        )

    # Emit metadata change proposals
    mcps: List[
        Union[
            MetadataChangeProposalWrapper,
            models.MetadataChangeProposalClass,
        ]
    ] = list(updater.build())
    if query_entity:
        mcps.extend(query_entity)
    self._client._graph.emit_mcps(mcps)
|
|
451
|
+
|
|
452
|
+
def infer_lineage_from_sql(
    self,
    *,
    query_text: str,
    platform: str,
    platform_instance: Optional[str] = None,
    env: str = "PROD",
    default_db: Optional[str] = None,
    default_schema: Optional[str] = None,
    override_dialect: Optional[str] = None,
) -> None:
    """Add lineage by parsing a SQL query.

    The query is parsed with the SQL lineage parser; the first output table
    becomes the downstream, and lineage (including any column-level
    mappings and the query text itself) is added from each input table.

    Args:
        query_text: The SQL statement to parse.
        platform: Platform name used to resolve table references to URNs.
        platform_instance: Optional platform instance for URN resolution.
        env: Environment for URN resolution (defaults to "PROD").
        default_db: Database assumed for unqualified table names.
        default_schema: Schema assumed for unqualified table names.
        override_dialect: Force a specific SQL dialect instead of inferring
            it from the platform.

    Raises:
        SdkUsageError: If the query cannot be parsed at the table level, or
            yields no output tables.
    """
    from datahub.sql_parsing.sqlglot_lineage import (
        create_lineage_sql_parsed_result,
    )

    # Parse the SQL query to extract lineage information
    parsed_result = create_lineage_sql_parsed_result(
        query=query_text,
        default_db=default_db,
        default_schema=default_schema,
        platform=platform,
        platform_instance=platform_instance,
        env=env,
        graph=self._client._graph,
        override_dialect=override_dialect,
    )

    if parsed_result.debug_info.table_error:
        raise SdkUsageError(
            f"Failed to parse SQL query: {parsed_result.debug_info.error}"
        )
    elif parsed_result.debug_info.column_error:
        # Table-level lineage is still usable; only warn on column errors.
        logger.warning(
            f"Failed to parse SQL query: {parsed_result.debug_info.error}",
        )

    if not parsed_result.out_tables:
        raise SdkUsageError(
            "No output tables found in the query. Cannot establish lineage."
        )

    # Use the first output table as the downstream
    downstream_urn = parsed_result.out_tables[0]

    # Process all upstream tables found in the query
    for upstream_table in parsed_result.in_tables:
        # Skip self-lineage
        if upstream_table == downstream_urn:
            continue

        # Extract column-level lineage for this specific upstream table
        column_mapping = {}
        if parsed_result.column_lineage:
            for col_lineage in parsed_result.column_lineage:
                if not (col_lineage.downstream and col_lineage.downstream.column):
                    continue

                # Filter upstreams to only include columns from current upstream table
                upstream_cols = [
                    ref.column
                    for ref in col_lineage.upstreams
                    if ref.table == upstream_table and ref.column
                ]

                if upstream_cols:
                    column_mapping[col_lineage.downstream.column] = upstream_cols

        # Add lineage, including query text
        self.add_lineage(
            upstream=upstream_table,
            downstream=downstream_urn,
            column_lineage=column_mapping,
            transformation_text=query_text,
        )
|
|
527
|
+
|
|
528
|
+
@deprecated("Use add_lineage instead")
def add_dataset_copy_lineage(
    self,
    *,
    upstream: DatasetUrnOrStr,
    downstream: DatasetUrnOrStr,
    column_lineage: Union[
        None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
    ] = "auto_fuzzy",
) -> None:
    """Deprecated: add COPY-type lineage between two datasets.

    Args:
        upstream: URN of the dataset copied from.
        downstream: URN of the dataset copied to.
        column_lineage: None to skip column lineage, "auto_fuzzy" /
            "auto_strict" to derive it from both datasets' schemas, or an
            explicit mapping of downstream column -> upstream columns.
    """
    upstream = DatasetUrn.from_string(upstream)
    downstream = DatasetUrn.from_string(downstream)

    if column_lineage is None:
        cll = None
    elif column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
        # Derive the column mapping from the two datasets' schemas.
        upstream_schema = self._get_fields_from_dataset_urn(upstream)
        downstream_schema = self._get_fields_from_dataset_urn(downstream)
        if column_lineage == "auto_fuzzy":
            mapping = self._get_fuzzy_column_lineage(
                upstream_schema, downstream_schema
            )
        else:
            mapping = self._get_strict_column_lineage(
                upstream_schema, downstream_schema
            )
        cll = parse_cll_mapping(
            upstream=upstream,
            downstream=downstream,
            cll_mapping=mapping,
        )
    elif isinstance(column_lineage, dict):
        cll = parse_cll_mapping(
            upstream=upstream,
            downstream=downstream,
            cll_mapping=column_lineage,
        )
    else:
        # Exhaustiveness check: any other value is a caller bug.
        assert_never(column_lineage)

    updater = DatasetPatchBuilder(str(downstream))
    updater.add_upstream_lineage(
        models.UpstreamClass(
            dataset=str(upstream),
            type=models.DatasetLineageTypeClass.COPY,
        )
    )
    for cl in cll or []:
        updater.add_fine_grained_upstream_lineage(cl)

    self._client.entities.update(updater)
|
|
579
|
+
|
|
580
|
+
@deprecated("Use add_lineage instead")
def add_dataset_transform_lineage(
    self,
    *,
    upstream: DatasetUrnOrStr,
    downstream: DatasetUrnOrStr,
    column_lineage: Optional[ColumnLineageMapping] = None,
    transformation_text: Optional[str] = None,
) -> None:
    """Deprecated: add TRANSFORMED-type lineage between two datasets.

    Args:
        upstream: URN of the source dataset.
        downstream: URN of the derived dataset.
        column_lineage: Optional explicit mapping of downstream column ->
            upstream columns.
        transformation_text: Optional SQL text; when given, a Query entity
            is created and referenced from the lineage edge.

    Raises:
        SdkUsageError: If the downstream dataset does not exist.
    """
    upstream = DatasetUrn.from_string(upstream)
    downstream = DatasetUrn.from_string(downstream)

    cll = None
    if column_lineage is not None:
        cll = parse_cll_mapping(
            upstream=upstream,
            downstream=downstream,
            cll_mapping=column_lineage,
        )

    # All dataset + schema-field URNs involved, for the query's subjects.
    fields_involved = OrderedSet([str(upstream), str(downstream)])
    if cll is not None:
        for c in cll:
            for field in c.upstreams or []:
                fields_involved.add(field)
            for field in c.downstreams or []:
                fields_involved.add(field)

    query_urn = None
    query_entity = None
    if transformation_text:
        # Eventually we might want to use our regex-based fingerprinting instead.
        fingerprint = generate_hash(transformation_text)
        query_urn = QueryUrn(fingerprint).urn()

        from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects

        query_entity = MetadataChangeProposalWrapper.construct_many(
            query_urn,
            aspects=[
                models.QueryPropertiesClass(
                    statement=models.QueryStatementClass(
                        value=transformation_text,
                        language=models.QueryLanguageClass.SQL,
                    ),
                    source=models.QuerySourceClass.SYSTEM,
                    created=_empty_audit_stamp,
                    lastModified=_empty_audit_stamp,
                ),
                make_query_subjects(list(fields_involved)),
            ],
        )

    updater = DatasetPatchBuilder(str(downstream))
    updater.add_upstream_lineage(
        models.UpstreamClass(
            dataset=str(upstream),
            type=models.DatasetLineageTypeClass.TRANSFORMED,
            query=query_urn,
        )
    )
    for cl in cll or []:
        cl.query = query_urn
        updater.add_fine_grained_upstream_lineage(cl)

    # Throw if the dataset does not exist.
    # We need to manually call .build() instead of reusing client.update()
    # so that we make just one emit_mcps call.
    if not self._client._graph.exists(updater.urn):
        raise SdkUsageError(
            f"Dataset {updater.urn} does not exist, and hence cannot be updated."
        )

    mcps: List[
        Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
    ] = list(updater.build())
    if query_entity:
        mcps.extend(query_entity)
    self._client._graph.emit_mcps(mcps)
|
|
659
|
+
|
|
660
|
+
@deprecated("Use add_lineage instead")
def add_datajob_lineage(
    self,
    *,
    datajob: DatajobUrnOrStr,
    upstreams: Optional[List[Union[DatasetUrnOrStr, DatajobUrnOrStr]]] = None,
    downstreams: Optional[List[DatasetUrnOrStr]] = None,
) -> None:
    """
    Add lineage between a datajob and datasets/datajobs.

    Args:
        datajob: The datajob URN to connect lineage with
        upstreams: List of upstream datasets or datajobs that serve as inputs to the datajob
        downstreams: List of downstream datasets that are outputs of the datajob

    Raises:
        SdkUsageError: If neither upstreams nor downstreams are provided.
        InvalidUrnError: If an upstream is neither a valid dataset nor
            datajob URN, or a downstream is not a valid dataset URN.
    """
    if not upstreams and not downstreams:
        raise SdkUsageError("No upstreams or downstreams provided")

    datajob_urn = DataJobUrn.from_string(datajob)

    # Initialize the patch builder for the datajob
    patch_builder = DataJobPatchBuilder(str(datajob_urn))

    # Process upstream connections (inputs to the datajob)
    for upstream in upstreams or []:
        # Each upstream may be a dataset or another datajob: try the
        # dataset interpretation first and fall back to datajob (EAFP).
        try:
            upstream_dataset_urn = DatasetUrn.from_string(upstream)
            patch_builder.add_input_dataset(upstream_dataset_urn)
        except InvalidUrnError:
            # BUG FIX: previously this branch reassigned `datajob_urn`,
            # shadowing the target datajob's URN with the upstream's URN.
            upstream_datajob_urn = DataJobUrn.from_string(upstream)
            patch_builder.add_input_datajob(upstream_datajob_urn)

    # Process downstream connections (outputs from the datajob)
    for downstream in downstreams or []:
        downstream_urn = DatasetUrn.from_string(downstream)
        patch_builder.add_output_dataset(downstream_urn)

    # Apply the changes to the entity
    self._client.entities.update(patch_builder)
|
|
705
|
+
|
|
706
|
+
def get_lineage(
    self,
    *,
    source_urn: Union[str, Urn],
    source_column: Optional[str] = None,
    direction: Literal["upstream", "downstream"] = "upstream",
    max_hops: int = 1,
    filter: Optional[Filter] = None,
    count: int = 500,
) -> List[LineageResult]:
    """Retrieve lineage entities connected to a source entity.

    Args:
        source_urn: Source URN for the lineage search.
        source_column: Source column for the lineage search.
        direction: Direction of lineage traversal.
        max_hops: Maximum number of hops to traverse.
        filter: Filters to apply to the lineage search.
        count: Maximum number of results to return.

    Returns:
        List of lineage results.

    Raises:
        SdkUsageError: For invalid filter values.
    """
    # Validate the URN up front, then delegate: variable preparation and
    # query execution live in dedicated helpers.
    parsed_source_urn = Urn.from_string(source_urn)
    query_variables = self._process_input_variables(
        parsed_source_urn, source_column, filter, direction, max_hops, count
    )
    return self._execute_lineage_query(query_variables, direction)
|
|
740
|
+
|
|
741
|
+
def _process_input_variables(
    self,
    source_urn: Urn,
    source_column: Optional[str] = None,
    filters: Optional[Filter] = None,
    direction: Literal["upstream", "downstream"] = "upstream",
    max_hops: int = 1,
    count: int = 500,
) -> Dict[str, Any]:
    """
    Process filters and prepare GraphQL query variables for lineage search.

    Args:
        source_urn: Source URN for the lineage search
        source_column: Source column for the lineage search
        filters: Optional filters to apply
        direction: Direction of lineage traversal
        max_hops: Maximum number of hops to traverse
        count: Maximum number of results to return

    Returns:
        Dictionary of GraphQL query variables

    Raises:
        SdkUsageError for invalid filter values
    """
    # print warning if max_hops is greater than 2
    if max_hops > 2:
        logger.warning(
            """If `max_hops` is more than 2, the search will try to find the full lineage graph.
            By default, only 500 results are shown.
            You can change the `count` to get more or fewer results.
            """
        )

    # Determine hop values: beyond 2 hops the backend buckets everything
    # into "3+", which effectively requests the full lineage graph.
    max_hop_values = (
        [str(hop) for hop in range(1, max_hops + 1)]
        if max_hops <= 2
        else ["1", "2", "3+"]
    )

    degree_filter = FilterDsl.custom_filter(
        field="degree",
        condition="EQUAL",
        values=max_hop_values,
    )

    # Always constrain by degree; AND in the caller's filters when present.
    filters_with_max_hops = (
        FilterDsl.and_(degree_filter, filters)
        if filters is not None
        else degree_filter
    )

    types, compiled_filters = compile_filters(filters_with_max_hops)

    # Prepare base variables
    variables: Dict[str, Any] = {
        "input": {
            "urn": str(source_urn),
            "direction": direction.upper(),
            "count": count,
            "types": types,
            "orFilters": compiled_filters,
        }
    }

    # Column-level lineage: group results at the schema-field level and
    # target a schemaField URN instead of the dataset URN.
    if isinstance(source_urn, SchemaFieldUrn) or source_column:
        variables["input"]["searchFlags"] = {
            "groupingSpec": {
                "groupingCriteria": {
                    "baseEntityType": "SCHEMA_FIELD",
                    "groupingEntityType": "SCHEMA_FIELD",
                }
            }
        }
        # FIX: a SchemaFieldUrn source needs no rewrite — "urn" was already
        # set from it above (the old code redundantly re-assigned it). Only
        # a dataset URN plus a column needs rewriting to a schemaField URN.
        if source_column and not isinstance(source_urn, SchemaFieldUrn):
            variables["input"]["urn"] = str(SchemaFieldUrn(source_urn, source_column))

    return variables
|
|
825
|
+
|
|
826
|
+
def _execute_lineage_query(
    self,
    variables: Dict[str, Any],
    direction: Literal["upstream", "downstream"],
) -> List[LineageResult]:
    """Execute GraphQL query and process results.

    Pages through scrollAcrossLineage until the server stops returning a
    nextScrollId, converting each search result into a LineageResult.
    """
    # Construct GraphQL query with dynamic path query
    graphql_query = """
        query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
            scrollAcrossLineage(input: $input) {
                nextScrollId
                searchResults {
                    degree
                    entity {
                        urn
                        type
                        ... on Dataset {
                            name
                            platform {
                                name
                            }
                            properties {
                                description
                            }
                        }
                        ... on DataJob {
                            jobId
                            dataPlatformInstance {
                                platform {
                                    name
                                }
                            }
                            properties {
                                name
                                description
                            }
                        }
                    }
                    paths {
                        path {
                            urn
                            type
                        }
                    }
                }
            }
        }
    """

    results: List[LineageResult] = []

    first_iter = True
    scroll_id: Optional[str] = None

    # Scroll-based pagination: keep requesting pages while the server
    # returns a continuation token.
    while first_iter or scroll_id:
        first_iter = False

        # Update scroll ID if applicable
        if scroll_id:
            variables["input"]["scrollId"] = scroll_id

        # Execute GraphQL query
        response = self._graph.execute_graphql(graphql_query, variables=variables)
        data = response["scrollAcrossLineage"]
        scroll_id = data.get("nextScrollId")

        # Process search results
        for entry in data["searchResults"]:
            entity = entry["entity"]

            result = self._create_lineage_result(entity, entry, direction)
            results.append(result)

    return results
|
|
900
|
+
|
|
901
|
+
def _create_lineage_result(
    self,
    entity: Dict[str, Any],
    entry: Dict[str, Any],
    direction: Literal["upstream", "downstream"],
) -> LineageResult:
    """Create a LineageResult from entity and entry data.

    Args:
        entity: GraphQL entity payload (urn/type plus per-type fields).
        entry: The enclosing searchResult entry (degree, paths).
        direction: Direction of the traversal this result belongs to.

    Returns:
        A populated LineageResult.
    """
    # Datasets expose platform at `platform.name`; datajobs nest it under
    # `dataPlatformInstance.platform.name`. Either may be null.
    platform = (entity.get("platform") or {}).get("name") or (
        (entity.get("dataPlatformInstance") or {}).get("platform") or {}
    ).get("name")

    result = LineageResult(
        urn=entity["urn"],
        type=entity["type"],
        hops=entry["degree"],
        direction=direction,
        platform=platform,
    )

    properties = entity.get("properties") or {}
    if properties:
        # BUG FIX: datasets carry their display name at the *entity* level
        # (the GraphQL query selects it there), not under `properties`, so
        # fall back to it instead of always returning "" for datasets.
        result.name = properties.get("name") or entity.get("name") or ""
        result.description = properties.get("description", "")

    result.paths = []
    # BUG FIX: `paths` may be present but null in the GraphQL response;
    # the old `if "paths" in entry:` check would then iterate None.
    for path in entry.get("paths") or []:
        for path_entry in path["path"]:
            # Only include schema fields in the path (exclude other types like Query)
            if path_entry["type"] == "SCHEMA_FIELD":
                schema_field_urn = SchemaFieldUrn.from_string(path_entry["urn"])
                result.paths.append(
                    LineagePath(
                        urn=path_entry["urn"],
                        entity_name=DatasetUrn.from_string(
                            schema_field_urn.parent
                        ).name,
                        column_name=schema_field_urn.field_path,
                    )
                )

    return result
|