acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/sqlglot_lineage.py
CHANGED
@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 
 
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
             return v
         return SchemaFieldDataTypeClass.from_obj(v)
 
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
 
-
+
+
 class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str
@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
 
 
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
 
     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
 
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+
 
 class _JoinInfo(_ParserBaseModel):
     join_type: str
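
Both __hash__ additions follow the same pattern: hash only the hashable fields so instances can live in sets during lineage processing, while pydantic's generated __eq__ keeps comparing all fields. A minimal sketch of that pattern with a stand-in model (names here are illustrative, not DataHub's):

    from typing import Any, Optional

    import pydantic


    class Ref(pydantic.BaseModel):
        table: str
        column: str
        # Stand-in for an unhashable auto-generated field like SchemaFieldDataTypeClass.
        column_type: Optional[Any] = None

        def __hash__(self) -> int:
            # Equal models (all fields equal) still get equal hashes, since the
            # hash uses a subset of the fields compared by __eq__.
            return hash((self.table, self.column))


    deduped = {Ref(table="t", column="a"), Ref(table="t", column="a")}
    assert len(deduped) == 1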
@@ -518,16 +543,19 @@ def _select_statement_cll(
     root_scope: sqlglot.optimizer.Scope,
     column_resolver: _ColumnResolver,
     output_table: Optional[_TableName],
+    table_name_schema_mapping: Dict[_TableName, SchemaInfo],
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []
 
     try:
-        # List output columns.
         output_columns = [
            (select_col.alias_or_name, select_col) for select_col in statement.selects
         ]
         logger.debug("output columns: %s", [col[0] for col in output_columns])
-        …
+
+        for output_col, _original_col_expression in output_columns:
             if not output_col or output_col == "*":
                 # If schema information is available, the * will be expanded to the actual columns.
                 # Otherwise, we can't process it.
@@ -551,13 +579,14 @@ def _select_statement_cll(
             trim_selects=False,
             # We don't need to pass the schema in here, since we've already qualified the columns.
         )
-        # import pathlib
-        # pathlib.Path("sqlglot.html").write_text(
-        #     str(lineage_node.to_html(dialect=dialect))
-        # )
 
         # Generate SELECT lineage.
-        direct_raw_col_upstreams = _get_direct_raw_col_upstreams(…
+        direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
+            lineage_node,
+            dialect,
+            default_db,
+            default_schema,
+        )
 
         # Fuzzy resolve the output column.
         original_col_expression = lineage_node.expression
@@ -576,7 +605,7 @@
             if original_col_expression.type:
                 output_col_type = original_col_expression.type
 
-            #
+            # Resolve upstream columns - table names should already be qualified from placeholder processing
             direct_resolved_col_upstreams = {
                 _ColumnRef(
                     table=edge.table,
@@ -662,6 +691,13 @@ def _column_level_lineage(
             select_statement=select_statement,
         )
 
+    # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+    if isinstance(select_statement, sqlglot.exp.Values):
+        return _ColumnLineageWithDebugInfo(
+            column_lineage=[],
+            select_statement=select_statement,
+        )
+
     assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
     try:
         root_scope = sqlglot.optimizer.build_scope(select_statement)
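
For context, this is how a VALUES-backed statement surfaces in sqlglot (a sketch against sqlglot's public API; the statement is made up):

    import sqlglot
    from sqlglot import exp

    stmt = sqlglot.parse_one("INSERT INTO t (a, b) VALUES (1, 2)")
    # The INSERT's source is a VALUES node rather than a SELECT, so there are
    # no upstream tables and no column-level lineage to derive from it.
    assert isinstance(stmt.expression, exp.Values)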
@@ -681,6 +717,9 @@
         root_scope=root_scope,
         column_resolver=column_resolver,
         output_table=downstream_table,
+        table_name_schema_mapping=table_name_schema_mapping,
+        default_db=default_db,
+        default_schema=default_schema,
     )
 
     joins: Optional[List[_JoinInfo]] = None
@@ -701,6 +740,9 @@
 
 def _get_direct_raw_col_upstreams(
     lineage_node: sqlglot.lineage.Node,
+    dialect: Optional[sqlglot.Dialect] = None,
+    default_db: Optional[str] = None,
+    default_schema: Optional[str] = None,
 ) -> OrderedSet[_ColumnRef]:
     # Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
     direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
@@ -730,6 +772,53 @@
             direct_raw_col_upstreams.add(
                 _ColumnRef(table=table_ref, column=normalized_col)
             )
+        elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
+            # Handle placeholder expressions from lateral joins.
+            #
+            # In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
+            # expressions instead of regular table references. This is critical for lateral join column lineage.
+            #
+            # Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
+            # The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
+            # which we need to parse to establish the lineage: output.value <- my_table2.value
+            #
+            # Without this handling, lateral join column lineage would be incomplete/missing.
+            try:
+                parsed = sqlglot.parse_one(node.name, dialect=dialect)
+                if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
+                    table_ref = _TableName.from_sqlglot_table(
+                        sqlglot.parse_one(
+                            parsed.table, into=sqlglot.exp.Table, dialect=dialect
+                        )
+                    )
+
+                    # SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
+                    # Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
+                    # We need this runtime qualification to ensure proper lineage resolution.
+                    # Only qualify if this appears to be a real table reference (not a temporary construct)
+                    if (
+                        not (table_ref.database or table_ref.db_schema)
+                        and dialect is not None
+                    ):
+                        table_ref = table_ref.qualified(
+                            dialect=dialect,
+                            default_db=default_db,
+                            default_schema=default_schema,
+                        )
+
+                    # Extract column name using proper isinstance check
+                    if isinstance(parsed.this, sqlglot.exp.Identifier):
+                        column_name = parsed.this.name
+                    else:
+                        column_name = str(parsed.this)
+                    direct_raw_col_upstreams.add(
+                        _ColumnRef(table=table_ref, column=column_name)
+                    )
+            except Exception as e:
+                logger.debug(
+                    f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
+                    exc_info=True,
+                )
         else:
             # This branch doesn't matter. For example, a count(*) column would go here, and
             # we don't get any column-level lineage for that.
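
A rough illustration of the parsing step this branch performs: a placeholder name like '"my_table2"."value"' parses into a Column whose table part can be re-parsed as a Table (the input string is invented for the sketch):

    import sqlglot
    from sqlglot import exp

    parsed = sqlglot.parse_one('"my_table2"."value"')
    assert isinstance(parsed, exp.Column) and parsed.table == "my_table2"

    table = sqlglot.parse_one(parsed.table, into=exp.Table)
    column = parsed.this.name  # "value"; parsed.this is an Identifier here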
@@ -832,7 +921,7 @@ def _get_raw_col_upstreams_for_expression(
             trim_selects=False,
         )
 
-        return _get_direct_raw_col_upstreams(node)
+        return _get_direct_raw_col_upstreams(node, dialect, None, None)
     finally:
         scope.expression = original_expression
 
@@ -847,8 +936,9 @@ def _list_joins(
 
     scope: sqlglot.optimizer.Scope
     for scope in root_scope.traverse():
+        # PART 1: Handle regular explicit JOINs (updated API)
         join: sqlglot.exp.Join
-        for join in scope.find_all(sqlglot.exp.Join):
+        for join in scope.expression.find_all(sqlglot.exp.Join):
            left_side_tables: OrderedSet[_TableName] = OrderedSet()
            from_clause: sqlglot.exp.From
            for from_clause in scope.find_all(sqlglot.exp.From):
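
The switch from scope.find_all to scope.expression.find_all tracks a sqlglot API change; a sketch of the newer traversal (assumed sqlglot behavior):

    import sqlglot
    from sqlglot import exp
    from sqlglot.optimizer.scope import build_scope

    stmt = sqlglot.parse_one("SELECT a FROM t1 JOIN t2 ON t1.id = t2.id")
    scope = build_scope(stmt)
    # Traverse the expression wrapped by the scope, not the scope object itself.
    assert len(list(scope.expression.find_all(exp.Join))) == 1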
@@ -924,6 +1014,36 @@
                 )
             )
 
+        # Handle LATERAL constructs
+        for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
+            # Get tables from non-lateral FROM clauses
+            qualified_left: OrderedSet[_TableName] = OrderedSet()
+            for from_clause in scope.find_all(sqlglot.exp.From):
+                if not isinstance(from_clause.this, sqlglot.exp.Lateral):
+                    qualified_left.update(
+                        _get_join_side_tables(from_clause.this, dialect, scope)
+                    )
+
+            # Get tables from lateral subquery
+            qualified_right: OrderedSet[_TableName] = OrderedSet()
+            if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
+                qualified_right.update(
+                    _TableName.from_sqlglot_table(t)
+                    for t in lateral.this.find_all(sqlglot.exp.Table)
+                )
+            qualified_right.update(qualified_left)
+
+            if qualified_left and qualified_right:
+                joins.append(
+                    _JoinInfo(
+                        join_type="LATERAL JOIN",
+                        left_tables=list(qualified_left),
+                        right_tables=list(qualified_right),
+                        on_clause=None,
+                        columns_involved=[],
+                    )
+                )
+
     return joins
 
 
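
For reference, the LATERAL shape this block walks (a sketch; table names are arbitrary):

    import sqlglot
    from sqlglot import exp

    sql = (
        "SELECT t2.value FROM t1, "
        "LATERAL (SELECT value FROM my_table2 WHERE t1.id = my_table2.id) t2"
    )
    lateral = next(sqlglot.parse_one(sql).find_all(exp.Lateral))
    # The lateral subquery still exposes its inner tables for the join's right side.
    assert [t.name for t in lateral.this.find_all(exp.Table)] == ["my_table2"]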
@@ -1063,7 +1183,12 @@ def _try_extract_select(
         statement = sqlglot.exp.Select().select("*").from_(statement)
     elif isinstance(statement, sqlglot.exp.Insert):
         # TODO Need to map column renames in the expressions part of the statement.
-        …
+        # Preserve CTEs when extracting the SELECT expression from INSERT
+        original_ctes = statement.ctes
+        statement = statement.expression  # Get the SELECT expression from the INSERT
+        if isinstance(statement, sqlglot.exp.Query) and original_ctes:
+            for cte in original_ctes:
+                statement = statement.with_(alias=cte.alias, as_=cte.this)
     elif isinstance(statement, sqlglot.exp.Update):
         # Assumption: the output table is already captured in the modified tables list.
         statement = _extract_select_from_update(statement)
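
A sketch of what the CTE-preservation branch achieves, assuming sqlglot attaches a leading WITH to the INSERT node and exposes it via .ctes (as the hunk relies on):

    import sqlglot

    stmt = sqlglot.parse_one("WITH c AS (SELECT 1 AS x) INSERT INTO t SELECT x FROM c")
    select = stmt.expression  # the SELECT body of the INSERT
    for cte in stmt.ctes:     # CTEs that would otherwise be dropped
        select = select.with_(alias=cte.alias, as_=cte.this)
    print(select.sql())       # roughly: WITH c AS (SELECT 1 AS x) SELECT x FROM c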
@@ -1161,25 +1286,30 @@ def _translate_internal_joins(
 ) -> List[JoinInfo]:
     joins = []
     for raw_join in raw_joins:
-        [17 removed lines; their content is not shown in the diff viewer]
+        try:
+            joins.append(
+                JoinInfo(
+                    join_type=raw_join.join_type,
+                    left_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.left_tables
+                    ],
+                    right_tables=[
+                        table_name_urn_mapping[table] for table in raw_join.right_tables
+                    ],
+                    on_clause=raw_join.on_clause,
+                    columns_involved=[
+                        ColumnRef(
+                            table=table_name_urn_mapping[col.table],
+                            column=col.column,
+                        )
+                        for col in raw_join.columns_involved
+                    ],
+                )
             )
-        …
+        except KeyError as e:
+            # Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
+            logger.debug(f"Skipping join with unresolvable table: {e}")
+            continue
     return joins
 
 
@@ -1231,12 +1361,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    …
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if …
-        dialect = get_dialect(…
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(…
+        dialect = get_dialect(schema_resolver.platform)
 
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1553,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    …
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.
 
@@ -1441,8 +1571,8 @@
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
 
-    The SQL dialect …
-    be …
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.
@@ -1457,7 +1587,7 @@
         schema_resolver: The schema resolver to use for resolving table schemas.
         default_db: The default database to use for unqualified table names.
         default_schema: The default schema to use for unqualified table names.
-        …
+        override_dialect: Override the dialect provided by 'schema_resolver'.
 
     Returns:
         A SqlParsingResult object containing the parsed lineage information.
@@ -1482,10 +1612,32 @@
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            …
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
+    except BaseException as e:
+        # Check if this is a PanicException from SQLGlot's Rust tokenizer
+        # We use runtime type checking instead of isinstance() because pyo3_runtime
+        # is only available when sqlglot[rs] is installed and may not be importable
+        # at module load time, but the exception can still be raised at runtime
+        if (
+            e.__class__.__name__ == "PanicException"
+            and e.__class__.__module__ == "pyo3_runtime"
+        ):
+            # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
+            # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
+            # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
+            # Avoid catching BaseException, as it includes KeyboardInterrupt
+            # and would prevent Ctrl+C from working.
+            wrapped_exception = Exception(
+                f"pyo3_runtime.PanicException during SQL parsing: {e}"
+            )
+            wrapped_exception.__cause__ = e
+            return SqlParsingResult.make_from_error(wrapped_exception)
+        else:
+            # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
+            raise
 
 
 _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
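
The structural check above can be read in isolation: the panic is matched by class name and module, because pyo3_runtime is only importable when sqlglot[rs] is installed. A standalone sketch:

    def _is_rust_panic(e: BaseException) -> bool:
        # Match pyo3_runtime.PanicException without importing pyo3_runtime,
        # which may be absent when sqlglot's Rust tokenizer isn't installed.
        return (
            e.__class__.__name__ == "PanicException"
            and e.__class__.__module__ == "pyo3_runtime"
        )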
@@ -1498,15 +1650,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    …
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
         return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema, …
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
     else:
         return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema, …
+            sql, schema_resolver, default_db, default_schema, override_dialect
         )
 
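
With the new parameter, callers can pin the dialect instead of relying on the resolver's platform. A hypothetical call (the SchemaResolver setup is assumed from DataHub's SDK; the query is made up):

    from datahub.sql_parsing.schema_resolver import SchemaResolver
    from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage

    resolver = SchemaResolver(platform="redshift")
    result = sqlglot_lineage(
        "SELECT id FROM db.sch.tbl",
        schema_resolver=resolver,
        default_db="db",
        override_dialect="postgres",  # parse as Postgres despite the platform
    )
    print(result.in_tables)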
@@ -1558,6 +1710,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
         platform=platform,
@@ -1577,6 +1730,7 @@
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
+            override_dialect=override_dialect,
         )
     except Exception as e:
         return SqlParsingResult.make_from_error(e)
datahub/sql_parsing/sqlglot_utils.py
CHANGED
@@ -40,9 +40,6 @@ def _get_dialect_str(platform: str) -> str:
         # let the fuzzy resolution logic handle it.
         # MariaDB is a fork of MySQL, so we reuse the same dialect.
         return "mysql, normalization_strategy = lowercase"
-    # Dremio is based upon drill. Not 100% compatibility
-    elif platform == "dremio":
-        return "drill"
     else:
         return platform
 
@@ -115,6 +112,8 @@ def _expression_to_string(
     return expression.sql(dialect=get_dialect(platform))
 
 
+PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
+
 _BASIC_NORMALIZATION_RULES = {
     # Remove /* */ comments.
     re.compile(r"/\*.*?\*/", re.DOTALL): "",
@@ -130,7 +129,9 @@ _BASIC_NORMALIZATION_RULES = {
     re.compile(r"'[^']*'"): "?",
     # Replace sequences of IN/VALUES with a single placeholder.
     # The r" ?" makes it more robust to uneven spacing.
-    re.compile(…
+    re.compile(
+        r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
+    ): r"\1 (?)",
     # Normalize parenthesis spacing.
     re.compile(r"\( "): "(",
     re.compile(r" \)"): ")",
@@ -139,6 +140,9 @@ _BASIC_NORMALIZATION_RULES = {
     # e.g. "col1,col2" -> "col1, col2"
     re.compile(r"\b ,"): ",",
     re.compile(r"\b,\b"): ", ",
+    # MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
+    # Replace all versions of placeholders with generic ? placeholder.
+    PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
 }
 _TABLE_NAME_NORMALIZATION_RULES = {
     # Replace UUID-like strings with a placeholder (both - and _ variants).
@@ -262,6 +266,10 @@ def get_query_fingerprint_debug(
         if not fast:
             dialect = get_dialect(platform)
             expression_sql = generalize_query(expression, dialect=dialect)
+            # Normalize placeholders for consistent fingerprinting -> this only needs to be backward compatible with earlier sqlglot generated generalized queries where the placeholders were always ?
+            expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
+                "?", expression_sql
+            )
         else:
             expression_sql = generalize_query_fast(expression, dialect=platform)
     except (ValueError, sqlglot.errors.SqlglotError) as e:
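
The backward-compatibility normalization in isolation: every %s, $N, or ? placeholder collapses to ?, so fingerprints line up with those produced when sqlglot always emitted ?:

    import re

    PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")

    sql = "SELECT * FROM t WHERE a = %s AND b = $1 AND c = ?"
    print(PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub("?", sql))
    # SELECT * FROM t WHERE a = ? AND b = ? AND c = ?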
datahub/sql_parsing/tool_meta_extractor.py
CHANGED
@@ -208,9 +208,7 @@ class ToolMetaExtractor:
         Returns:
             bool: whether QueryLog entry is that of hex.
         """
-        …
-        …
-        if not last_line.startswith("-- Hex query metadata:"):
+        if "-- Hex query metadata:" not in entry.query_text:
             return False
 
         entry.origin = HEX_PLATFORM_URN
datahub/telemetry/telemetry.py
CHANGED
@@ -16,6 +16,11 @@ from datahub._version import __version__, nice_version_name
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
+from datahub.configuration.env_vars import (
+    get_sentry_dsn,
+    get_sentry_environment,
+    get_telemetry_timeout,
+)
 from datahub.metadata.schema_classes import _custom_package_path
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -97,14 +102,14 @@ if any(var in os.environ for var in CI_ENV_VARS):
 if _custom_package_path:
     ENV_ENABLED = False
 
-TIMEOUT = int(…
+TIMEOUT = int(get_telemetry_timeout())
 MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
 MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
-SENTRY_DSN: Optional[str] = …
-SENTRY_ENVIRONMENT: str = …
+SENTRY_DSN: Optional[str] = get_sentry_dsn()
+SENTRY_ENVIRONMENT: str = get_sentry_environment()
 
 
-def …
+def _default_global_properties() -> Dict[str, Any]:
     return {
         "datahub_version": nice_version_name(),
         "python_version": platform.python_version(),
@@ -122,6 +127,7 @@ class Telemetry:
     context_properties: Dict[str, Any] = {}
 
     def __init__(self):
+        self.global_properties = _default_global_properties()
         self.context_properties = {}
 
         if SENTRY_DSN:
@@ -247,6 +253,10 @@ class Telemetry:
 
         return False
 
+    def add_global_property(self, key: str, value: Any) -> None:
+        self.global_properties[key] = value
+        self._update_sentry_properties()
+
     def set_context(
         self,
         server: Optional["DataHubGraph"] = None,
@@ -257,16 +267,20 @@ class Telemetry:
             **(properties or {}),
         }
 
-        …
-        from sentry_sdk import set_tag
+        self._update_sentry_properties()
 
-        …
-        …
-        …
-        …
+    def _update_sentry_properties(self) -> None:
+        properties = {
+            **self.global_properties,
+            **self.context_properties,
+        }
+        if self.sentry_enabled:
+            import sentry_sdk
 
-        …
-        …
+            # Note: once we're on sentry-sdk 2.1.0+, we can use sentry_sdk.set_tags(properties)
+            # See https://github.com/getsentry/sentry-python/commit/6c960d752c7c7aff3fd7469d2e9ad98f19663aa8
+            for key, value in properties.items():
+                sentry_sdk.set_tag(key, value)
 
     def init_capture_exception(self) -> None:
         if self.sentry_enabled:
@@ -300,7 +314,7 @@ class Telemetry:
         try:
             self.mp.people_set(
                 self.client_id,
-                …
+                self.global_properties,
             )
         except Exception as e:
             logger.debug(f"Error initializing telemetry: {e}")
@@ -334,7 +348,7 @@ class Telemetry:
         logger.debug(f"Sending telemetry for {event_name}")
 
         properties = {
-            **…
+            **self.global_properties,
             **self.context_properties,
             **properties,
         }
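
Note the merge order: event properties override context properties, which override globals. In isolation (values are illustrative):

    global_properties = {"datahub_version": "1.3.0", "env": "prod"}
    context_properties = {"env": "staging"}
    properties = {"event": "ingest"}

    merged = {**global_properties, **context_properties, **properties}
    assert merged == {"datahub_version": "1.3.0", "env": "staging", "event": "ingest"}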
datahub/testing/sdk_v2_helpers.py
CHANGED
@@ -1,12 +1,18 @@
 import pathlib
+from typing import Sequence
 
 from datahub.sdk.entity import Entity
 from datahub.testing import mce_helpers
 
 
-def assert_entity_golden(…
+def assert_entity_golden(
+    entity: Entity,
+    golden_path: pathlib.Path,
+    ignore_paths: Sequence[str] = (),
+) -> None:
     mce_helpers.check_goldens_stream(
         outputs=entity.as_mcps(),
         golden_path=golden_path,
         ignore_order=False,
+        ignore_paths=ignore_paths,
     )