acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/specific/aspect_helpers/structured_properties.py CHANGED

@@ -70,3 +70,30 @@ class HasStructuredPropertiesPatch(MetadataPatchProposal):
             ),
         )
         return self
+
+    def set_structured_property_manual(
+        self, property: StructuredPropertyValueAssignmentClass
+    ) -> Self:
+        """Add or update a structured property, using a StructuredPropertyValueAssignmentClass object."""
+
+        self.remove_structured_property(property.propertyUrn)
+        self._add_patch(
+            StructuredPropertiesClass.ASPECT_NAME,
+            "add",
+            path=("properties", property.propertyUrn),
+            value=property,
+        )
+        return self
+
+    def add_structured_property_manual(
+        self, property: StructuredPropertyValueAssignmentClass
+    ) -> Self:
+        """Add a structured property, using a StructuredPropertyValueAssignmentClass object."""
+
+        self._add_patch(
+            StructuredPropertiesClass.ASPECT_NAME,
+            "add",
+            path=("properties", property.propertyUrn),
+            value=property,
+        )
+        return self
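The hunk above adds two helpers, set_structured_property_manual() and add_structured_property_manual(), to the shared HasStructuredPropertiesPatch mixin, so any patch builder that mixes it in can apply a pre-built StructuredPropertyValueAssignmentClass. A minimal usage sketch (not taken from the package; the GMS endpoint and all URNs are placeholders):

from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import StructuredPropertyValueAssignmentClass
from datahub.specific.dataset import DatasetPatchBuilder

# Placeholder GMS endpoint; adjust for a real deployment.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

assignment = StructuredPropertyValueAssignmentClass(
    propertyUrn="urn:li:structuredProperty:io.example.dataTier",  # placeholder property
    values=["gold"],
)

patch = DatasetPatchBuilder(
    "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)"  # placeholder dataset
)
# set_structured_property_manual() removes any existing assignment of the
# property before re-adding it; add_structured_property_manual() only adds.
patch.set_structured_property_manual(assignment)

for mcp in patch.build():
    emitter.emit(mcp)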
datahub/specific/chart.py CHANGED

datahub/specific/datajob.py CHANGED

@@ -1,15 +1,19 @@
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Set, Tuple, Union
 
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
 from datahub.metadata.schema_classes import (
     DataJobInfoClass as DataJobInfo,
     DataJobInputOutputClass as DataJobInputOutput,
     EdgeClass as Edge,
+    FineGrainedLineageClass as FineGrainedLineage,
     KafkaAuditHeaderClass,
     SystemMetadataClass,
 )
 from datahub.metadata.urns import SchemaFieldUrn, Urn
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.fine_grained_lineage import (
+    HasFineGrainedLineagePatch,
+)
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch

@@ -20,6 +24,7 @@ class DataJobPatchBuilder(
     HasCustomPropertiesPatch,
     HasTagsPatch,
     HasTermsPatch,
+    HasFineGrainedLineagePatch,
     MetadataPatchProposal,
 ):
     def __init__(

@@ -40,10 +45,19 @@ class DataJobPatchBuilder(
             urn, system_metadata=system_metadata, audit_header=audit_header
         )
 
+        # Track fine-grained lineages for DataJob-specific handling
+        self._fine_grained_lineages_to_add: List[FineGrainedLineage] = []
+        self._fine_grained_lineage_keys_to_remove: Set[Tuple[str, str, str]] = set()
+        self._fine_grained_lineages_set: Optional[List[FineGrainedLineage]] = None
+
     @classmethod
     def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
         return DataJobInfo.ASPECT_NAME, ("customProperties",)
 
+    @classmethod
+    def _fine_grained_lineage_location(cls) -> Tuple[str, PatchPath]:
+        return DataJobInputOutput.ASPECT_NAME, ("fineGrainedLineages",)
+
     def add_input_datajob(self, input: Union[Edge, Urn, str]) -> "DataJobPatchBuilder":
         """
         Adds an input data job to the DataJobPatchBuilder.
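With HasFineGrainedLineagePatch now mixed into DataJobPatchBuilder, column-level lineage can be patched onto a job's dataJobInputOutput aspect. A hedged sketch, assuming the mixin exposes add_fine_grained_lineage() as the deprecation notices in dataset.py indicate; all URNs are placeholders:

from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)
from datahub.specific.datajob import DataJobPatchBuilder

patch = DataJobPatchBuilder(
    "urn:li:dataJob:(urn:li:dataFlow:(airflow,example_dag,prod),example_task)"  # placeholder
)
patch.add_fine_grained_lineage(  # provided by the new HasFineGrainedLineagePatch mixin
    FineGrainedLineageClass(
        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
        upstreams=[
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,src.table,PROD),user_id)"
        ],
        downstreams=[
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,dst.table,PROD),user_id)"
        ],
    )
)
mcps = list(patch.build())  # emit these with any DataHub emitter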
datahub/specific/dataproduct.py CHANGED

@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.structured_properties import (
+    HasStructuredPropertiesPatch,
+)
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
 

@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
 class DataProductPatchBuilder(
     HasOwnershipPatch,
     HasCustomPropertiesPatch,
+    HasStructuredPropertiesPatch,
     HasTagsPatch,
     HasTermsPatch,
     MetadataPatchProposal,
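DataProductPatchBuilder picks up the same structured-properties helpers shown earlier, so data products can be patched like datasets. A brief illustrative sketch with placeholder URNs:

from datahub.metadata.schema_classes import StructuredPropertyValueAssignmentClass
from datahub.specific.dataproduct import DataProductPatchBuilder

patch = DataProductPatchBuilder("urn:li:dataProduct:customer_360")  # placeholder URN
patch.add_structured_property_manual(
    StructuredPropertyValueAssignmentClass(
        propertyUrn="urn:li:structuredProperty:io.example.owningTeam",  # placeholder
        values=["growth"],
    )
)
mcps = list(patch.build())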
datahub/specific/dataset.py CHANGED

@@ -1,3 +1,4 @@
+import warnings
 from typing import Generic, List, Optional, Tuple, TypeVar, Union
 
 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath

@@ -17,7 +18,11 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
+from datahub.specific.aspect_helpers.fine_grained_lineage import (
+    HasFineGrainedLineagePatch,
+)
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
+from datahub.specific.aspect_helpers.siblings import HasSiblingsPatch
 from datahub.specific.aspect_helpers.structured_properties import (
     HasStructuredPropertiesPatch,
 )

@@ -99,6 +104,8 @@ class DatasetPatchBuilder(
     HasStructuredPropertiesPatch,
     HasTagsPatch,
     HasTermsPatch,
+    HasFineGrainedLineagePatch,
+    HasSiblingsPatch,
     MetadataPatchProposal,
 ):
     def __init__(

@@ -115,6 +122,10 @@ class DatasetPatchBuilder(
     def _custom_properties_location(cls) -> Tuple[str, PatchPath]:
         return DatasetProperties.ASPECT_NAME, ("customProperties",)
 
+    @classmethod
+    def _fine_grained_lineage_location(cls) -> Tuple[str, PatchPath]:
+        return UpstreamLineage.ASPECT_NAME, ("fineGrainedLineages",)
+
     def add_upstream_lineage(self, upstream: Upstream) -> "DatasetPatchBuilder":
         self._add_patch(
             UpstreamLineage.ASPECT_NAME,

@@ -144,75 +155,44 @@ class DatasetPatchBuilder(
     def add_fine_grained_upstream_lineage(
         self, fine_grained_lineage: FineGrainedLineage
     ) -> "DatasetPatchBuilder":
-
-
-
-
-
-
-
-
-                "add",
-                path=self._build_fine_grained_path(
-                    transform_op, downstream_urn, query_id, upstream_urn
-                ),
-                value={"confidenceScore": fine_grained_lineage.confidenceScore},
-            )
-        return self
-
-    @staticmethod
-    def get_fine_grained_key(
-        fine_grained_lineage: FineGrainedLineage,
-    ) -> Tuple[str, str, str]:
-        downstreams = fine_grained_lineage.downstreams or []
-        if len(downstreams) != 1:
-            raise TypeError("Cannot patch with more or less than one downstream.")
-        transform_op = fine_grained_lineage.transformOperation or "NONE"
-        downstream_urn = downstreams[0]
-        query_id = fine_grained_lineage.query or "NONE"
-        return transform_op, downstream_urn, query_id
-
-    @classmethod
-    def _build_fine_grained_path(
-        cls, transform_op: str, downstream_urn: str, query_id: str, upstream_urn: str
-    ) -> PatchPath:
-        return (
-            "fineGrainedLineages",
-            transform_op,
-            downstream_urn,
-            query_id,
-            upstream_urn,
+        """
+        Deprecated: Use `add_fine_grained_lineage` instead.
+        """
+        warnings.warn(
+            "add_fine_grained_upstream_lineage() is deprecated."
+            " Use add_fine_grained_lineage() instead.",
+            DeprecationWarning,
+            stacklevel=2,
         )
+        return self.add_fine_grained_lineage(fine_grained_lineage)
 
     def remove_fine_grained_upstream_lineage(
         self, fine_grained_lineage: FineGrainedLineage
     ) -> "DatasetPatchBuilder":
-
-
-
-
-
-
-
-
-
-
-                    transform_op, downstream_urn, query_id, upstream_urn
-                ),
-                value={},
-            )
-        return self
+        """
+        Deprecated: Use `remove_fine_grained_lineage` instead.
+        """
+        warnings.warn(
+            "remove_fine_grained_upstream_lineage() is deprecated."
+            " Use remove_fine_grained_lineage() instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.remove_fine_grained_lineage(fine_grained_lineage)
 
     def set_fine_grained_upstream_lineages(
         self, fine_grained_lineages: List[FineGrainedLineage]
     ) -> "DatasetPatchBuilder":
-
-
-
-
-
+        """
+        Deprecated: Use `set_fine_grained_lineages` instead.
+        """
+        warnings.warn(
+            "set_fine_grained_upstream_lineages() is deprecated."
+            " Use set_fine_grained_lineages() instead.",
+            DeprecationWarning,
+            stacklevel=2,
        )
-        return self
+        return self.set_fine_grained_lineages(fine_grained_lineages)
 
     def for_field(
         self, field_path: str, editable: bool = True
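The three *_upstream_lineage methods above become thin deprecation shims that delegate to their HasFineGrainedLineagePatch counterparts. A small sketch of how a caller might surface the new DeprecationWarning while migrating (illustrative only; the dataset URN is a placeholder):

import warnings

from datahub.specific.dataset import DatasetPatchBuilder

patch = DatasetPatchBuilder(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"  # placeholder
)

with warnings.catch_warnings(record=True) as captured:
    # DeprecationWarning is hidden by default outside __main__; opt in explicitly.
    warnings.simplefilter("always", DeprecationWarning)
    patch.set_fine_grained_upstream_lineages([])  # old name: warns, then delegates
assert any(issubclass(w.category, DeprecationWarning) for w in captured)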
datahub/sql_parsing/split_statements.py CHANGED

@@ -52,6 +52,7 @@ class ParserState(Enum):
     STRING = 2
     COMMENT = 3
     MULTILINE_COMMENT = 4
+    BRACKETED_IDENTIFIER = 5
 
 
 class _StatementSplitter:

@@ -141,6 +142,10 @@ class _StatementSplitter:
                 self.state = ParserState.STRING
                 self.current_statement.append(c)
                 prev_real_char = c
+            elif c == "[":
+                self.state = ParserState.BRACKETED_IDENTIFIER
+                self.current_statement.append(c)
+                prev_real_char = c
             elif c == "-" and next_char == "-":
                 self.state = ParserState.COMMENT
                 self.current_statement.append(c)

@@ -172,6 +177,14 @@
             elif c == "'":
                 self.state = ParserState.NORMAL
 
+        elif self.state == ParserState.BRACKETED_IDENTIFIER:
+            self.current_statement.append(c)
+            if c == "]" and next_char == "]":
+                self.current_statement.append(next_char)
+                self.i += 1
+            elif c == "]":
+                self.state = ParserState.NORMAL
+
         elif self.state == ParserState.COMMENT:
             self.current_statement.append(c)
             if c == "\n":
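The new BRACKETED_IDENTIFIER state stops semicolons inside T-SQL [bracketed] identifiers from being treated as statement terminators. An illustrative check, assuming the module keeps exposing its split_statements() helper as in earlier releases:

from datahub.sql_parsing.split_statements import split_statements

sql = "SELECT 1 AS [weird;name]; UPDATE [some;table] SET x = 1;"
statements = [s.strip() for s in split_statements(sql) if s.strip()]
# The ';' characters inside the bracketed identifiers no longer split statements.
assert len(statements) == 2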
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED

@@ -4,7 +4,6 @@ import enum
 import functools
 import json
 import logging
-import os
 import pathlib
 import tempfile
 import uuid

@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast
 
 import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
+from datahub.configuration.env_vars import get_sql_agg_query_log
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import compute_upstream_fields
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.workunit import MetadataWorkUnit

@@ -49,6 +48,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     sqlglot_lineage,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     _parse_statement,
     get_query_fingerprint,
     try_format_query,

@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,

@@ -82,7 +83,7 @@ class QueryLogSetting(enum.Enum):
 _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
 _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
 _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
-
+    get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
 ]
 MAX_UPSTREAM_TABLES_COUNT = 300
 MAX_FINEGRAINEDLINEAGE_COUNT = 2000

@@ -108,6 +109,7 @@ class ObservedQuery:
     default_schema: Optional[str] = None
     query_hash: Optional[str] = None
     usage_multiplier: int = 1
+    override_dialect: Optional[DialectOrStr] = None
 
     # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None

@@ -140,6 +142,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:

@@ -188,6 +191,7 @@ class QueryMetadata:
             source=models.QuerySourceClass.SYSTEM,
             created=self.make_created_audit_stamp(),
             lastModified=self.make_last_modified_audit_stamp(),
+            origin=self.origin.urn() if self.origin else None,
         )
 
 

@@ -263,7 +267,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
     )
-    # Use this to store
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 

@@ -629,6 +633,9 @@ class SqlParsingAggregator(Closeable):
             TableSwap,
         ],
     ) -> None:
+        """
+        This assumes that queries come in order of increasing timestamps.
+        """
         if isinstance(item, KnownQueryLineageInfo):
             self.add_known_query_lineage(item)
         elif isinstance(item, KnownLineageMapping):

@@ -831,6 +838,7 @@ class SqlParsingAggregator(Closeable):
             session_id=session_id,
             timestamp=observed.timestamp,
             user=observed.user,
+            override_dialect=observed.override_dialect,
         )
         if parsed.debug_info.error:
             self.report.observed_query_parse_failures.append(

@@ -859,7 +867,7 @@ class SqlParsingAggregator(Closeable):
                 downstream=parsed.out_tables[0] if parsed.out_tables else None,
                 column_lineage=parsed.column_lineage,
                 # TODO: We need a full list of columns referenced, not just the out tables.
-                column_usage=
+                column_usage=self._compute_upstream_fields(parsed),
                 inferred_schema=infer_output_schema(parsed),
                 confidence_score=parsed.debug_info.confidence,
                 extra_info=observed.extra_info,

@@ -948,6 +956,7 @@ class SqlParsingAggregator(Closeable):
                     column_usage=parsed.column_usage or {},
                     confidence_score=parsed.confidence_score,
                     used_temp_tables=session_has_temp_tables,
+                    extra_info=parsed.extra_info,
                     origin=parsed.origin,
                 )
             )

@@ -1147,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
                 actor=None,
                 upstreams=parsed.in_tables,
                 column_lineage=parsed.column_lineage or [],
-                column_usage=
+                column_usage=self._compute_upstream_fields(parsed),
                 confidence_score=parsed.debug_info.confidence,
             )
         )

@@ -1164,6 +1173,7 @@ class SqlParsingAggregator(Closeable):
         session_id: str = _MISSING_SESSION_ID,
         timestamp: Optional[datetime] = None,
         user: Optional[Union[CorpUserUrn, CorpGroupUrn]] = None,
+        override_dialect: Optional[DialectOrStr] = None,
     ) -> SqlParsingResult:
         with self.report.sql_parsing_timer:
             parsed = sqlglot_lineage(

@@ -1171,6 +1181,7 @@ class SqlParsingAggregator(Closeable):
                 schema_resolver=schema_resolver,
                 default_db=default_db,
                 default_schema=default_schema,
+                override_dialect=override_dialect,
             )
             self.report.num_sql_parsed += 1
 

@@ -1329,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
                 upstreams.setdefault(upstream, query.query_id)
 
             for lineage_info in query.column_lineage:
-
-
-
-
+                if (
+                    not lineage_info.downstream.column
+                    or not lineage_info.downstream.column.strip()
+                ):
+                    logger.debug(
+                        f"Skipping lineage entry with empty downstream column in query {query.query_id}"
                     )
+                    continue
+
+                for upstream_ref in lineage_info.upstreams:
+                    if upstream_ref.column and upstream_ref.column.strip():
+                        cll[lineage_info.downstream.column].setdefault(
+                            SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
+                            query.query_id,
+                        )
+                    else:
+                        logger.debug(
+                            f"Skipping empty column reference in lineage for query {query.query_id}"
+                        )
 
             # Finally, we can build our lineage edge.
             required_queries = OrderedSet[QueryId]()

@@ -1491,9 +1516,9 @@ class SqlParsingAggregator(Closeable):
             return
 
         # If a query doesn't involve any allowed tables, skip it.
-        if
-        self.is_allowed_table(
-        ):
+        if (
+            downstream_urn is None or not self.is_allowed_table(downstream_urn)
+        ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
             self.report.num_queries_skipped_due_to_filters += 1
             return
 

@@ -1574,27 +1599,33 @@ class SqlParsingAggregator(Closeable):
 
         @dataclasses.dataclass
         class QueryLineageInfo:
-            upstreams:
-
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
             confidence_score: float
 
             def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams
-                self.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                 self.confidence_score = min(
                     self.confidence_score, other_query.confidence_score
                 )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
         def _recurse_into_query(
             query: QueryMetadata, recursion_path: List[QueryId]
         ) -> QueryLineageInfo:
             if query.query_id in recursion_path:
                 # This is a cycle, so we just return the query as-is.
                 return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                     confidence_score=query.confidence_score,
                 )
+            if query.query_id in cache:
+                return cache[query.query_id]
             recursion_path = [*recursion_path, query.query_id]
             composed_of_queries.add(query.query_id)
 

@@ -1609,7 +1640,7 @@ class SqlParsingAggregator(Closeable):
                 upstream_query = self._query_map.get(upstream_query_id)
                 if (
                     upstream_query
-                    and upstream_query.query_id not in
+                    and upstream_query.query_id not in recursion_path
                 ):
                     temp_query_lineage_info = _recurse_into_query(
                         upstream_query, recursion_path

@@ -1669,11 +1700,14 @@ class SqlParsingAggregator(Closeable):
                 ]
             )
 
-
-            upstreams=
-            column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                 confidence_score=new_confidence_score,
             )
+            cache[query.query_id] = ret
+
+            return ret
 
         resolved_lineage_info = _recurse_into_query(base_query, [])
 

@@ -1706,20 +1740,30 @@ class SqlParsingAggregator(Closeable):
         )
 
         merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
         )
 
         resolved_query = dataclasses.replace(
             base_query,
             query_id=composite_query_id,
             formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
             confidence_score=resolved_lineage_info.confidence_score,
         )
 
         return resolved_query
 
+    @staticmethod
+    def _compute_upstream_fields(
+        result: SqlParsingResult,
+    ) -> Dict[UrnStr, Set[UrnStr]]:
+        upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
+        for cl in result.column_lineage or []:
+            for upstream in cl.upstreams:
+                upstream_fields[upstream.table].add(upstream.column)
+        return upstream_fields
+
     def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
         if not self._usage_aggregator:
             return