acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/sql/trino.py
+++ b/datahub/ingestion/source/sql/trino.py
@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -36,6 +37,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -128,24 +130,47 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
         if catalog_name is None:
             raise exc.NoSuchTableError("catalog is required in connection")
         connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-        if
-
-
+        if (
+            connector_name is not None
+            and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+        ):
             properties_table = self._get_full_table(f"{table_name}$properties", schema)
             query = f"SELECT * FROM {properties_table}"
-
+            rows = connection.execute(sql.text(query)).fetchall()
 
             # Generate properties dictionary.
             properties = {}
-
+
+            if len(rows) == 0:
+                # No properties found, return empty dictionary
+                return {}
+
+            # Check if using the old format (key, value columns)
+            if (
+                connector_name == "iceberg"
+                and len(rows[0]) == 2
+                and "key" in rows[0]
+                and "value" in rows[0]
+            ):
+                # https://trino.io/docs/current/connector/iceberg.html#properties-table
+                for row in rows:
+                    if row["value"] is not None:
+                        properties[row["key"]] = row["value"]
+                return {"text": properties.get("comment"), "properties": properties}
+            elif connector_name == "hive" and len(rows[0]) > 1 and len(rows) == 1:
+                # https://trino.io/docs/current/connector/hive.html#properties-table
+                row = rows[0]
                 for col_name, col_value in row.items():
                     if col_value is not None:
                         properties[col_name] = col_value
+                return {"text": properties.get("comment"), "properties": properties}
 
-
-
-
-
+            # If we can't get the properties we still fallback to the default
+            return self.get_table_comment_default(connection, table_name, schema)
+        except Exception as e:
+            logging.warning(
+                f"Failed to get table comment for {table_name} in {schema}: {e}"
+            )
             return {}
 
 
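The rewritten branch above distinguishes the two result shapes that `SELECT * FROM "<table>$properties"` can return, per the Trino docs linked in the diff comments: the Iceberg connector yields one (key, value) row per property, while the Hive connector yields a single wide row with one column per property. A standalone sketch of the two conversions, using made-up row data in place of a live Trino connection:

# Made-up rows illustrating the two $properties result shapes.
iceberg_rows = [
    {"key": "comment", "value": "daily order snapshots"},
    {"key": "write.format.default", "value": "parquet"},
]
hive_rows = [{"comment": "daily order snapshots", "numFiles": "12"}]

# Iceberg: one (key, value) row per property -> properties dict
properties = {r["key"]: r["value"] for r in iceberg_rows if r["value"] is not None}
assert properties["comment"] == "daily order snapshots"

# Hive: a single wide row, one column per property -> properties dict
properties = {c: v for c, v in hive_rows[0].items() if v is not None}
assert properties["comment"] == "daily order snapshots"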
@@ -198,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino"
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
@@ -226,6 +251,14 @@ class TrinoConfig(BasicSQLAlchemyConfig):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Extract table-level lineage",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class TrinoSource(SQLAlchemySource):
     """
 
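A recurring pattern in this release: config fields previously marked with `hidden_from_docs=True` on `Field(...)` now carry a `HiddenFromDocs[...]` type annotation instead (see `scheme` above and `schema_pattern`/`use_schema_resolver` below). The real implementation lives in `datahub.configuration.common`; a minimal sketch of how such an annotation-based marker can work, assuming it is an `Annotated` alias (the names `_HiddenFromDocsMarker` and `ExampleConfig` are hypothetical):

from typing import Annotated, TypeVar

from pydantic import BaseModel, Field

T = TypeVar("T")


class _HiddenFromDocsMarker:
    """Sentinel that documentation tooling can look for in field metadata."""


# Sketch: HiddenFromDocs[X] is Annotated[X, marker], so the field's runtime
# type is unchanged while docs generation can detect the marker and skip it.
HiddenFromDocs = Annotated[T, _HiddenFromDocsMarker]


class ExampleConfig(BaseModel):
    scheme: HiddenFromDocs[str] = Field(default="trino")
    database: str = Field(description="database (catalog)")


# A docs generator can filter fields by inspecting the annotation metadata:
for name, info in ExampleConfig.model_fields.items():
    hidden = _HiddenFromDocsMarker in info.metadata
    print(f"{name}: {'hidden' if hidden else 'documented'}")

Moving the hidden flag into the type keeps `Field(...)` free for pydantic-native arguments, which is why the `scheme` and `schema_pattern` declarations above lose their extra keyword.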
--- a/datahub/ingestion/source/sql/two_tier_sql_source.py
+++ b/datahub/ingestion/source/sql/two_tier_sql_source.py
@@ -7,19 +7,19 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, logger
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 
 
 class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
         # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 
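`make_sqlalchemy_uri` moves out of `sql_config` into the new `datahub/ingestion/source/sql/sqlalchemy_uri.py` module (+68 lines in the file list above), so callers import it from there. Assuming the helper keeps the signature it had in earlier releases (scheme, username, password, host-and-port, database, optional URI options; verify against the new module), usage looks like:

from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri

# Signature assumed from earlier releases of this helper.
uri = make_sqlalchemy_uri(
    scheme="trino",
    username="analyst",
    password="s3cr3t",
    at="trino.example.com:8080",  # host:port
    db="hive",
    uri_opts={"source": "datahub"},
)
# -> roughly "trino://analyst:s3cr3t@trino.example.com:8080/hive?source=datahub"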
--- a/datahub/ingestion/source/sql/vertica.py
+++ b/datahub/ingestion/source/sql/vertica.py
@@ -4,7 +4,8 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import pydantic
-
+import pytest
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +46,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -52,6 +56,8 @@ from datahub.utilities import config_clean
 
 if TYPE_CHECKING:
     from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
+
+pytestmark = pytest.mark.integration_batch_4
 logger: logging.Logger = logging.getLogger(__name__)
 
 
@@ -113,10 +119,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +503,8 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()
 
         if self.config.domain:
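The `subtype_modifier` argument added to `@capability` here and in the Trino diff above lets each capability declare which entity subtypes it applies to, which lines up with the new autogenerated `datahub/ingestion/autogenerated/capability_summary.json` in the file list. A simplified, hypothetical sketch of a class decorator that accumulates such metadata (the real one lives in `datahub.ingestion.api.decorators`; the enums here are stand-ins):

from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Type


class SourceCapability(Enum):  # stand-in for the real enum
    LINEAGE_COARSE = "Table-Level Lineage"


class SourceCapabilityModifier(Enum):  # stand-in for the real enum
    TABLE = "TABLE"
    VIEW = "VIEW"
    PROJECTIONS = "PROJECTIONS"


@dataclass
class CapabilitySetting:
    capability: SourceCapability
    description: str
    supported: bool
    subtype_modifier: Optional[List[SourceCapabilityModifier]]


def capability(
    capability_name: SourceCapability,
    description: str,
    supported: bool = True,
    subtype_modifier: Optional[List[SourceCapabilityModifier]] = None,
):
    """Accumulate capability metadata on the decorated source class."""

    def wrapper(cls: Type) -> Type:
        settings = list(getattr(cls, "_capabilities", []))  # don't mutate base class state
        settings.append(
            CapabilitySetting(capability_name, description, supported, subtype_modifier)
        )
        cls._capabilities = settings  # a summary generator can read this later
        return cls

    return wrapper


@capability(
    SourceCapability.LINEAGE_COARSE,
    "Enabled by default",
    subtype_modifier=[SourceCapabilityModifier.VIEW, SourceCapabilityModifier.PROJECTIONS],
)
class DemoSource:
    pass


print(DemoSource._capabilities[0].subtype_modifier)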
--- a/datahub/ingestion/source/sql_queries.py
+++ b/datahub/ingestion/source/sql_queries.py
@@ -2,21 +2,22 @@ import json
 import logging
 import os
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime
 from functools import partial
-from typing import Iterable, List, Optional,
+from typing import ClassVar, Iterable, List, Optional, Union
 
-from pydantic import Field
+from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
 from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
-    make_user_urn,
 )
-from datahub.emitter.
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -35,13 +40,21 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
+from datahub.metadata.urns import CorpUserUrn, DatasetUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlAggregatorReport,
+    SqlParsingAggregator,
+)
 
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -53,45 +66,34 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
     )
-
+    override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
+@dataclass
 class SqlQueriesSourceReport(SourceReport):
-
-
-
-
-    def compute_stats(self) -> None:
-        super().compute_stats()
-        self.table_failure_rate = (
-            f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
-        self.column_failure_rate = (
-            f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
+    num_entries_processed: int = 0
+    num_entries_failed: int = 0
+    num_queries_aggregator_failures: int = 0
 
+    sql_aggregator: Optional[SqlAggregatorReport] = None
 
-
+
+@platform_name("SQL Queries", id="sql-queries")
 @config_class(SqlQueriesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries")
@@ -107,15 +109,25 @@ class SqlQueriesSource(Source):
     - user (optional): string - The user who ran the query.
         This user value will be directly converted into a DataHub user urn.
     - operation_type (optional): string - Platform-specific operation type, used if the operation type can't be parsed.
+    - session_id (optional): string - Session identifier for temporary table resolution across queries.
     - downstream_tables (optional): string[] - Fallback list of tables that the query writes to,
         used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
        used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
-
-
-    builder: SqlParsingBuilder
+    schema_resolver: Optional[SchemaResolver]
+    aggregator: SqlParsingAggregator
 
     def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         if not ctx.graph:
@@ -128,22 +140,36 @@ class SqlQueriesSource(Source):
         self.config = config
         self.report = SqlQueriesSourceReport()
 
-        self.builder = SqlParsingBuilder(usage_config=self.config.usage)
-
         if self.config.use_schema_resolver:
+            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
+            # for the given platform, platform instance, and env. Instead this should be configurable:
+            # bulk initialization vs lazy on-demand schema fetching.
             self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
            )
-            self.urns = self.schema_resolver.get_urns()
         else:
-            self.schema_resolver =
-
-
-
-            self.
+            self.schema_resolver = None
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.config.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            schema_resolver=self.schema_resolver,
+            eager_graph_load=False,
+            generate_lineage=True,  # TODO: make this configurable
+            generate_queries=True,  # TODO: make this configurable
+            generate_query_subject_fields=True,  # TODO: make this configurable
+            generate_query_usage_statistics=True,  # This enables publishing SELECT query entities, otherwise only mutation queries are published
+            generate_usage_statistics=True,
+            generate_operations=True,  # TODO: make this configurable
+            usage_config=self.config.usage,
+            is_temp_table=None,
+            is_allowed_table=None,
+            format_queries=False,
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":
@@ -154,100 +180,172 @@ class SqlQueriesSource(Source):
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
-    def get_workunits_internal(
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+
+        with self.report.new_stage("Collecting queries from file"):
+            queries = list(self._parse_query_file())
+            logger.info(f"Collected {len(queries)} queries for processing")
+
+        with self.report.new_stage("Processing queries through SQL parsing aggregator"):
+            for query_entry in queries:
+                self._add_query_to_aggregator(query_entry)
+
+        with self.report.new_stage("Generating metadata work units"):
+            logger.info("Generating workunits from SQL parsing aggregator")
+            yield from self.aggregator.gen_metadata()
+
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
                     query_dict = json.loads(line, strict=False)
                     entry = QueryEntry.create(query_dict, config=self.config)
-
+                    self.report.num_entries_processed += 1
+                    if self.report.num_entries_processed % 1000 == 0:
+                        logger.info(
+                            f"Processed {self.report.num_entries_processed} query entries"
+                        )
+                    yield entry
                 except Exception as e:
-
-                    self.report.
-
-
-
-
-                    self.report.num_queries_parsed += 1
-                    if self.report.num_queries_parsed % 1000 == 0:
-                        logger.info(f"Parsed {self.report.num_queries_parsed} queries")
+                    self.report.num_entries_failed += 1
+                    self.report.warning(
+                        title="Error processing query",
+                        message="Query skipped due to parsing error",
+                        context=line.strip(),
+                        exc=e,
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
+        """Add a query to the SQL parsing aggregator."""
+        try:
+            # If we have both upstream and downstream tables, use explicit lineage
+            if query_entry.upstream_tables and query_entry.downstream_tables:
+                logger.debug("Using explicit lineage from query file")
+                for downstream_table in query_entry.downstream_tables:
+                    known_lineage = KnownQueryLineageInfo(
+                        query_text=query_entry.query,
+                        downstream=str(downstream_table),
+                        upstreams=[str(urn) for urn in query_entry.upstream_tables],
+                        timestamp=query_entry.timestamp,
+                        session_id=query_entry.session_id,
+                    )
+                    self.aggregator.add_known_query_lineage(known_lineage)
+            else:
+                # Warn if only partial lineage information is provided
+                # XOR: true if exactly one of upstream_tables or downstream_tables is provided
+                if bool(query_entry.upstream_tables) ^ bool(
+                    query_entry.downstream_tables
+                ):
+                    query_preview = (
+                        query_entry.query[:150] + "..."
+                        if len(query_entry.query) > 150
+                        else query_entry.query
+                    )
+                    missing_upstream = (
+                        "Missing upstream. " if not query_entry.upstream_tables else ""
+                    )
+                    missing_downstream = (
+                        "Missing downstream. "
+                        if not query_entry.downstream_tables
+                        else ""
+                    )
+                    logger.info(
+                        f"Only partial lineage information provided, falling back to SQL parsing for complete lineage detection. {missing_upstream}{missing_downstream}Query: {query_preview}"
+                    )
+                # No explicit lineage, rely on parsing
+                observed_query = ObservedQuery(
+                    query=query_entry.query,
+                    timestamp=query_entry.timestamp,
+                    user=query_entry.user,
+                    session_id=query_entry.session_id,
+                    default_db=self.config.default_db,
+                    default_schema=self.config.default_schema,
+                    override_dialect=self.config.override_dialect,
                )
-
-
-
-
+                self.aggregator.add_observed_query(observed_query)
+
+        except Exception as e:
+            self.report.num_queries_aggregator_failures += 1
+            self.report.warning(
+                title="Error adding query to aggregator",
+                message="Query skipped due to failure when adding query to SQL parsing aggregator",
+                context=query_entry.query,
+                exc=e,
            )
-            self.report.num_column_parse_failures += 1
-
-            yield from self.builder.process_sql_parsing_result(
-                result,
-                query=entry.query,
-                query_timestamp=entry.timestamp,
-                user=entry.user,
-                custom_operation_type=entry.operation_type,
-                include_urns=self.urns,
-            )
 
 
-
-class QueryEntry:
+class QueryEntry(BaseModel):
     query: str
-    timestamp: Optional[datetime]
-    user: Optional[
-    operation_type: Optional[str]
-    downstream_tables: List[
-    upstream_tables: List[
+    timestamp: Optional[datetime] = None
+    user: Optional[CorpUserUrn] = None
+    operation_type: Optional[str] = None
+    downstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    upstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    session_id: Optional[str] = None
+
+    # Validation context for URN creation
+    _validation_context: ClassVar[Optional[SqlQueriesSourceConfig]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @validator("timestamp", pre=True)
+    def parse_timestamp(cls, v):
+        return None if v is None else parse_user_datetime(str(v))
+
+    @validator("user", pre=True)
+    def parse_user(cls, v):
+        if v is None:
+            return None
+
+        return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
+
+    @validator("downstream_tables", "upstream_tables", pre=True)
+    def parse_tables(cls, v):
+        if not v:
+            return []
+
+        result = []
+        for item in v:
+            if isinstance(item, DatasetUrn):
+                result.append(item)
+            elif isinstance(item, str):
+                # Skip empty/whitespace-only strings
+                if item and item.strip():
+                    # Convert to URN using validation context
+                    assert cls._validation_context, (
+                        "Validation context must be set for URN creation"
+                    )
+                    urn_string = make_dataset_urn_with_platform_instance(
+                        name=item,
+                        platform=cls._validation_context.platform,
+                        platform_instance=cls._validation_context.platform_instance,
+                        env=cls._validation_context.env,
+                    )
+                    result.append(DatasetUrn.from_string(urn_string))
+
+        return result
 
     @classmethod
     def create(
         cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
     ) -> "QueryEntry":
-
-
-
-
-
-
-
-            user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None,
-            operation_type=entry_dict.get("operation_type"),
-            downstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("downstream_tables", [])
-            ],
-            upstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("upstream_tables", [])
-            ],
-        )
+        """Create QueryEntry from dict with config context."""
+        # Set validation context for URN creation
+        cls._validation_context = config
+        try:
+            return cls.parse_obj(entry_dict)
+        finally:
+            cls._validation_context = None