acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -13,13 +13,15 @@ from typing import Any, Dict, Iterable, List, Optional, Union
|
|
|
13
13
|
import pydantic
|
|
14
14
|
from typing_extensions import Self
|
|
15
15
|
|
|
16
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
16
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
17
17
|
from datahub.configuration.time_window_config import (
|
|
18
18
|
BaseTimeWindowConfig,
|
|
19
19
|
BucketDuration,
|
|
20
|
+
get_time_bucket,
|
|
20
21
|
)
|
|
21
22
|
from datahub.ingestion.api.closeable import Closeable
|
|
22
23
|
from datahub.ingestion.api.common import PipelineContext
|
|
24
|
+
from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status
|
|
23
25
|
from datahub.ingestion.api.report import Report
|
|
24
26
|
from datahub.ingestion.api.source import Source, SourceReport
|
|
25
27
|
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
@@ -28,6 +30,7 @@ from datahub.ingestion.graph.client import DataHubGraph
|
|
|
28
30
|
from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
|
|
29
31
|
from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
30
32
|
DEFAULT_TEMP_TABLES_PATTERNS,
|
|
33
|
+
QueryDedupStrategyType,
|
|
31
34
|
SnowflakeFilterConfig,
|
|
32
35
|
SnowflakeIdentifierConfig,
|
|
33
36
|
)
|
|
@@ -44,6 +47,14 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
44
47
|
SnowflakeIdentifierBuilder,
|
|
45
48
|
SnowflakeStructuredReportMixin,
|
|
46
49
|
)
|
|
50
|
+
from datahub.ingestion.source.snowflake.stored_proc_lineage import (
|
|
51
|
+
StoredProcCall,
|
|
52
|
+
StoredProcLineageReport,
|
|
53
|
+
StoredProcLineageTracker,
|
|
54
|
+
)
|
|
55
|
+
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
56
|
+
RedundantQueriesRunSkipHandler,
|
|
57
|
+
)
|
|
47
58
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
48
59
|
from datahub.metadata.urns import CorpUserUrn
|
|
49
60
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -63,7 +74,10 @@ from datahub.sql_parsing.sqlglot_lineage import (
|
|
|
63
74
|
DownstreamColumnRef,
|
|
64
75
|
)
|
|
65
76
|
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
|
|
66
|
-
from datahub.utilities.file_backed_collections import
|
|
77
|
+
from datahub.utilities.file_backed_collections import (
|
|
78
|
+
ConnectionWrapper,
|
|
79
|
+
FileBackedList,
|
|
80
|
+
)
|
|
67
81
|
from datahub.utilities.perf_timer import PerfTimer
|
|
68
82
|
|
|
69
83
|
logger = logging.getLogger(__name__)
|
|
@@ -80,10 +94,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
|
|
|
80
94
|
|
|
81
95
|
pushdown_deny_usernames: List[str] = pydantic.Field(
|
|
82
96
|
default=[],
|
|
83
|
-
description="List of snowflake usernames which will
|
|
97
|
+
description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
|
|
84
98
|
"This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
|
|
85
99
|
)
|
|
86
100
|
|
|
101
|
+
pushdown_allow_usernames: List[str] = pydantic.Field(
|
|
102
|
+
default=[],
|
|
103
|
+
description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
|
|
104
|
+
"This is primarily useful for improving performance by filtering in only specific users. "
|
|
105
|
+
"If not specified, all users not in deny list are included.",
|
|
106
|
+
)
|
|
107
|
+
|
|
87
108
|
user_email_pattern: AllowDenyPattern = pydantic.Field(
|
|
88
109
|
default=AllowDenyPattern.allow_all(),
|
|
89
110
|
description="Regex patterns for user emails to filter in usage.",
|
|
@@ -96,12 +117,11 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
|
|
|
96
117
|
"to ignore the temporary staging tables created by known ETL tools.",
|
|
97
118
|
)
|
|
98
119
|
|
|
99
|
-
local_temp_path: Optional[pathlib.Path] = pydantic.Field(
|
|
100
|
-
default=None,
|
|
101
|
-
description="Local path to store the audit log.",
|
|
120
|
+
local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = pydantic.Field(
|
|
102
121
|
# TODO: For now, this is simply an advanced config to make local testing easier.
|
|
103
122
|
# Eventually, we will want to store date-specific files in the directory and use it as a cache.
|
|
104
|
-
|
|
123
|
+
default=None,
|
|
124
|
+
description="Local path to store the audit log.",
|
|
105
125
|
)
|
|
106
126
|
|
|
107
127
|
include_lineage: bool = True
|
|
@@ -110,6 +130,22 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
|
|
|
110
130
|
include_query_usage_statistics: bool = True
|
|
111
131
|
include_operations: bool = True
|
|
112
132
|
|
|
133
|
+
push_down_database_pattern_access_history: bool = pydantic.Field(
|
|
134
|
+
default=False,
|
|
135
|
+
description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
|
|
136
|
+
"This filters on the accessed objects in access_history.",
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
additional_database_names_allowlist: List[str] = pydantic.Field(
|
|
140
|
+
default=[],
|
|
141
|
+
description="Additional database names (no pattern matching) to be included in the access_history filter. "
|
|
142
|
+
"Only applies if push_down_database_pattern_access_history=True. "
|
|
143
|
+
"These databases will be included in the filter being pushed down regardless of database_pattern settings."
|
|
144
|
+
"This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
|
|
148
|
+
|
|
113
149
|
|
|
114
150
|
class SnowflakeQueriesSourceConfig(
|
|
115
151
|
SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
|
|
@@ -124,7 +160,10 @@ class SnowflakeQueriesExtractorReport(Report):
|
|
|
124
160
|
users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
125
161
|
|
|
126
162
|
audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
163
|
+
aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
|
164
|
+
|
|
127
165
|
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
166
|
+
stored_proc_lineage: Optional[StoredProcLineageReport] = None
|
|
128
167
|
|
|
129
168
|
num_ddl_queries_dropped: int = 0
|
|
130
169
|
num_stream_queries_observed: int = 0
|
|
@@ -146,6 +185,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
146
185
|
structured_report: SourceReport,
|
|
147
186
|
filters: SnowflakeFilter,
|
|
148
187
|
identifiers: SnowflakeIdentifierBuilder,
|
|
188
|
+
redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
|
|
149
189
|
graph: Optional[DataHubGraph] = None,
|
|
150
190
|
schema_resolver: Optional[SchemaResolver] = None,
|
|
151
191
|
discovered_tables: Optional[List[str]] = None,
|
|
@@ -157,9 +197,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
157
197
|
self.filters = filters
|
|
158
198
|
self.identifiers = identifiers
|
|
159
199
|
self.discovered_tables = set(discovered_tables) if discovered_tables else None
|
|
200
|
+
self.redundant_run_skip_handler = redundant_run_skip_handler
|
|
160
201
|
|
|
161
202
|
self._structured_report = structured_report
|
|
162
203
|
|
|
204
|
+
# Adjust time window based on stateful ingestion state
|
|
205
|
+
self.start_time, self.end_time = self._get_time_window()
|
|
206
|
+
|
|
163
207
|
# The exit stack helps ensure that we close all the resources we open.
|
|
164
208
|
self._exit_stack = contextlib.ExitStack()
|
|
165
209
|
|
|
@@ -177,8 +221,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
177
221
|
generate_query_usage_statistics=self.config.include_query_usage_statistics,
|
|
178
222
|
usage_config=BaseUsageConfig(
|
|
179
223
|
bucket_duration=self.config.window.bucket_duration,
|
|
180
|
-
start_time=self.
|
|
181
|
-
end_time=self.
|
|
224
|
+
start_time=self.start_time,
|
|
225
|
+
end_time=self.end_time,
|
|
182
226
|
user_email_pattern=self.config.user_email_pattern,
|
|
183
227
|
# TODO make the rest of the fields configurable
|
|
184
228
|
),
|
|
@@ -194,6 +238,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
194
238
|
def structured_reporter(self) -> SourceReport:
|
|
195
239
|
return self._structured_report
|
|
196
240
|
|
|
241
|
+
def _get_time_window(self) -> tuple[datetime, datetime]:
|
|
242
|
+
if self.redundant_run_skip_handler:
|
|
243
|
+
start_time, end_time = (
|
|
244
|
+
self.redundant_run_skip_handler.suggest_run_time_window(
|
|
245
|
+
self.config.window.start_time,
|
|
246
|
+
self.config.window.end_time,
|
|
247
|
+
)
|
|
248
|
+
)
|
|
249
|
+
else:
|
|
250
|
+
start_time = self.config.window.start_time
|
|
251
|
+
end_time = self.config.window.end_time
|
|
252
|
+
|
|
253
|
+
# Usage statistics are aggregated per bucket (typically per day).
|
|
254
|
+
# To ensure accurate aggregated metrics, we need to align the start_time
|
|
255
|
+
# to the beginning of a bucket so that we include complete bucket periods.
|
|
256
|
+
if self.config.include_usage_statistics:
|
|
257
|
+
start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
|
|
258
|
+
|
|
259
|
+
return start_time, end_time
|
|
260
|
+
|
|
261
|
+
def _update_state(self) -> None:
|
|
262
|
+
if self.redundant_run_skip_handler:
|
|
263
|
+
self.redundant_run_skip_handler.update_state(
|
|
264
|
+
self.config.window.start_time,
|
|
265
|
+
self.config.window.end_time,
|
|
266
|
+
self.config.window.bucket_duration,
|
|
267
|
+
)
|
|
268
|
+
|
|
197
269
|
@functools.cached_property
|
|
198
270
|
def local_temp_path(self) -> pathlib.Path:
|
|
199
271
|
if self.config.local_temp_path:
|
|
@@ -243,6 +315,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
243
315
|
audit_log_file = self.local_temp_path / "audit_log.sqlite"
|
|
244
316
|
use_cached_audit_log = audit_log_file.exists()
|
|
245
317
|
|
|
318
|
+
if self.config.local_temp_path is None:
|
|
319
|
+
self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
|
|
320
|
+
|
|
321
|
+
shared_connection = self._exit_stack.enter_context(
|
|
322
|
+
ConnectionWrapper(audit_log_file)
|
|
323
|
+
)
|
|
246
324
|
queries: FileBackedList[
|
|
247
325
|
Union[
|
|
248
326
|
KnownLineageMapping,
|
|
@@ -250,44 +328,73 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
250
328
|
TableRename,
|
|
251
329
|
TableSwap,
|
|
252
330
|
ObservedQuery,
|
|
331
|
+
StoredProcCall,
|
|
253
332
|
]
|
|
254
|
-
]
|
|
333
|
+
] = self._exit_stack.enter_context(FileBackedList(shared_connection))
|
|
334
|
+
|
|
255
335
|
if use_cached_audit_log:
|
|
256
|
-
logger.info("Using cached audit log")
|
|
257
|
-
shared_connection = ConnectionWrapper(audit_log_file)
|
|
258
|
-
queries = FileBackedList(shared_connection)
|
|
336
|
+
logger.info(f"Using cached audit log at {audit_log_file}")
|
|
259
337
|
else:
|
|
260
|
-
|
|
338
|
+
# Check if any query-based features are enabled before fetching
|
|
339
|
+
needs_query_data = any(
|
|
340
|
+
[
|
|
341
|
+
self.config.include_lineage,
|
|
342
|
+
self.config.include_queries,
|
|
343
|
+
self.config.include_usage_statistics,
|
|
344
|
+
self.config.include_query_usage_statistics,
|
|
345
|
+
self.config.include_operations,
|
|
346
|
+
]
|
|
347
|
+
)
|
|
261
348
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
TableSwap,
|
|
269
|
-
ObservedQuery,
|
|
270
|
-
]
|
|
349
|
+
if not needs_query_data:
|
|
350
|
+
logger.info(
|
|
351
|
+
"All query-based features are disabled. Skipping expensive query log fetch."
|
|
352
|
+
)
|
|
353
|
+
else:
|
|
354
|
+
logger.info(f"Fetching audit log into {audit_log_file}")
|
|
271
355
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
356
|
+
with self.report.copy_history_fetch_timer:
|
|
357
|
+
for copy_entry in self.fetch_copy_history():
|
|
358
|
+
queries.append(copy_entry)
|
|
275
359
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
360
|
+
with self.report.query_log_fetch_timer:
|
|
361
|
+
for entry in self.fetch_query_log(users):
|
|
362
|
+
queries.append(entry)
|
|
363
|
+
|
|
364
|
+
stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
|
|
365
|
+
StoredProcLineageTracker(
|
|
366
|
+
platform=self.identifiers.platform,
|
|
367
|
+
shared_connection=shared_connection,
|
|
368
|
+
)
|
|
369
|
+
)
|
|
370
|
+
self.report.stored_proc_lineage = stored_proc_tracker.report
|
|
279
371
|
|
|
280
372
|
with self.report.audit_log_load_timer:
|
|
281
373
|
for i, query in enumerate(queries):
|
|
282
374
|
if i % 1000 == 0:
|
|
283
375
|
logger.info(f"Added {i} query log entries to SQL aggregator")
|
|
284
|
-
self.aggregator.add(query)
|
|
285
376
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
377
|
+
if isinstance(query, StoredProcCall):
|
|
378
|
+
stored_proc_tracker.add_stored_proc_call(query)
|
|
379
|
+
continue
|
|
380
|
+
|
|
381
|
+
if not (
|
|
382
|
+
isinstance(query, PreparsedQuery)
|
|
383
|
+
and stored_proc_tracker.add_related_query(query)
|
|
384
|
+
):
|
|
385
|
+
# Only add to aggregator if it's not part of a stored procedure.
|
|
386
|
+
self.aggregator.add(query)
|
|
387
|
+
|
|
388
|
+
# Generate and add stored procedure lineage entries.
|
|
389
|
+
for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
|
|
390
|
+
# TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
|
|
391
|
+
self.aggregator.add(lineage_entry)
|
|
392
|
+
|
|
393
|
+
with self.report.aggregator_generate_timer:
|
|
394
|
+
yield from auto_workunit(self.aggregator.gen_metadata())
|
|
395
|
+
|
|
396
|
+
# Update the stateful ingestion state after successful extraction
|
|
397
|
+
self._update_state()
|
|
291
398
|
|
|
292
399
|
def fetch_users(self) -> UsersMapping:
|
|
293
400
|
users: UsersMapping = dict()
|
|
@@ -312,8 +419,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
312
419
|
# Derived from _populate_external_lineage_from_copy_history.
|
|
313
420
|
|
|
314
421
|
query: str = SnowflakeQuery.copy_lineage_history(
|
|
315
|
-
start_time_millis=int(self.
|
|
316
|
-
end_time_millis=int(self.
|
|
422
|
+
start_time_millis=int(self.start_time.timestamp() * 1000),
|
|
423
|
+
end_time_millis=int(self.end_time.timestamp() * 1000),
|
|
317
424
|
downstreams_deny_pattern=self.config.temporary_tables_pattern,
|
|
318
425
|
)
|
|
319
426
|
|
|
@@ -344,13 +451,23 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
344
451
|
|
|
345
452
|
def fetch_query_log(
|
|
346
453
|
self, users: UsersMapping
|
|
347
|
-
) -> Iterable[
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
454
|
+
) -> Iterable[
|
|
455
|
+
Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
|
|
456
|
+
]:
|
|
457
|
+
query_log_query = QueryLogQueryBuilder(
|
|
458
|
+
start_time=self.start_time,
|
|
459
|
+
end_time=self.end_time,
|
|
351
460
|
bucket_duration=self.config.window.bucket_duration,
|
|
352
461
|
deny_usernames=self.config.pushdown_deny_usernames,
|
|
353
|
-
|
|
462
|
+
allow_usernames=self.config.pushdown_allow_usernames,
|
|
463
|
+
dedup_strategy=self.config.query_dedup_strategy,
|
|
464
|
+
database_pattern=self.filters.filter_config.database_pattern
|
|
465
|
+
if self.config.push_down_database_pattern_access_history
|
|
466
|
+
else None,
|
|
467
|
+
additional_database_names=self.config.additional_database_names_allowlist
|
|
468
|
+
if self.config.push_down_database_pattern_access_history
|
|
469
|
+
else None,
|
|
470
|
+
).build_enriched_query_log_query()
|
|
354
471
|
|
|
355
472
|
with self.structured_reporter.report_exc(
|
|
356
473
|
"Error fetching query log from Snowflake"
|
|
@@ -384,7 +501,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
384
501
|
|
|
385
502
|
def _parse_audit_log_row(
|
|
386
503
|
self, row: Dict[str, Any], users: UsersMapping
|
|
387
|
-
) -> Optional[
|
|
504
|
+
) -> Optional[
|
|
505
|
+
Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
|
|
506
|
+
]:
|
|
388
507
|
json_fields = {
|
|
389
508
|
"DIRECT_OBJECTS_ACCESSED",
|
|
390
509
|
"OBJECTS_MODIFIED",
|
|
@@ -403,8 +522,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
403
522
|
|
|
404
523
|
# TODO need to map snowflake query types to ours
|
|
405
524
|
query_text: str = res["query_text"]
|
|
525
|
+
snowflake_query_type: str = res["query_type"]
|
|
406
526
|
query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
|
|
407
|
-
|
|
527
|
+
snowflake_query_type, QueryType.UNKNOWN
|
|
408
528
|
)
|
|
409
529
|
|
|
410
530
|
direct_objects_accessed = res["direct_objects_accessed"]
|
|
@@ -421,7 +541,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
421
541
|
res["session_id"],
|
|
422
542
|
timestamp,
|
|
423
543
|
object_modified_by_ddl,
|
|
424
|
-
|
|
544
|
+
snowflake_query_type,
|
|
425
545
|
)
|
|
426
546
|
if known_ddl_entry:
|
|
427
547
|
return known_ddl_entry
|
|
@@ -436,6 +556,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
436
556
|
res["user_name"], users.get(res["user_name"])
|
|
437
557
|
)
|
|
438
558
|
)
|
|
559
|
+
extra_info = {
|
|
560
|
+
"snowflake_query_id": res["query_id"],
|
|
561
|
+
"snowflake_root_query_id": res["root_query_id"],
|
|
562
|
+
"snowflake_query_type": res["query_type"],
|
|
563
|
+
"snowflake_role_name": res["role_name"],
|
|
564
|
+
"query_duration": res["query_duration"],
|
|
565
|
+
"rows_inserted": res["rows_inserted"],
|
|
566
|
+
"rows_updated": res["rows_updated"],
|
|
567
|
+
"rows_deleted": res["rows_deleted"],
|
|
568
|
+
}
|
|
439
569
|
|
|
440
570
|
# There are a couple cases when we'd want to prefer our own SQL parsing
|
|
441
571
|
# over Snowflake's metadata.
|
|
@@ -470,6 +600,18 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
470
600
|
query_hash=get_query_fingerprint(
|
|
471
601
|
query_text, self.identifiers.platform, fast=True
|
|
472
602
|
),
|
|
603
|
+
extra_info=extra_info,
|
|
604
|
+
)
|
|
605
|
+
|
|
606
|
+
if snowflake_query_type == "CALL" and res["root_query_id"] is None:
|
|
607
|
+
return StoredProcCall(
|
|
608
|
+
# This is the top-level query ID that other entries will reference.
|
|
609
|
+
snowflake_root_query_id=res["query_id"],
|
|
610
|
+
query_text=query_text,
|
|
611
|
+
timestamp=timestamp,
|
|
612
|
+
user=user,
|
|
613
|
+
default_db=res["default_db"],
|
|
614
|
+
default_schema=res["default_schema"],
|
|
473
615
|
)
|
|
474
616
|
|
|
475
617
|
upstreams = []
|
|
@@ -556,6 +698,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
556
698
|
timestamp=timestamp,
|
|
557
699
|
session_id=res["session_id"],
|
|
558
700
|
query_type=query_type,
|
|
701
|
+
extra_info=extra_info,
|
|
559
702
|
)
|
|
560
703
|
return entry
|
|
561
704
|
|
|
@@ -608,6 +751,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
608
751
|
self._exit_stack.close()
|
|
609
752
|
|
|
610
753
|
|
|
754
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
755
|
+
@config_class(SnowflakeQueriesSourceConfig)
|
|
611
756
|
class SnowflakeQueriesSource(Source):
|
|
612
757
|
def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig):
|
|
613
758
|
self.ctx = ctx
|
|
@@ -652,69 +797,288 @@ class SnowflakeQueriesSource(Source):
|
|
|
652
797
|
def close(self) -> None:
    """
    Release the resources held by this source.

    Closes `self.connection` first, then `self.queries_extractor`, and
    finally delegates to the parent class's `close()`.
    """
    self.connection.close()
    self.queries_extractor.close()
    super().close()
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
class QueryLogQueryBuilder:
|
|
804
|
+
def __init__(
    self,
    start_time: datetime,
    end_time: datetime,
    bucket_duration: BucketDuration,
    deny_usernames: Optional[List[str]] = None,
    allow_usernames: Optional[List[str]] = None,
    max_tables_per_query: int = 20,
    dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
    database_pattern: Optional[AllowDenyPattern] = None,
    additional_database_names: Optional[List[str]] = None,
):
    """
    Capture the query window and pre-render the SQL filter fragments.

    Args:
        start_time: Start of the query window; also stored as epoch milliseconds.
        end_time: End of the query window; also stored as epoch milliseconds.
        bucket_duration: Bucket size; its `.value` must be "HOUR", "DAY" or "MONTH".
        deny_usernames: Username patterns to exclude (see `_build_user_filter`).
        allow_usernames: Username patterns to include (see `_build_user_filter`).
        max_tables_per_query: Stored as-is for use by the query text.
        dedup_strategy: Which deduplication variant the generated SQL uses.
        database_pattern: Optional allow/deny pattern for database filtering.
        additional_database_names: Optional database names that are always included.
    """
    # Window boundaries, both as datetimes and as epoch milliseconds.
    self.start_time = start_time
    self.end_time = end_time
    self.start_time_millis = int(start_time.timestamp() * 1000)
    self.end_time_millis = int(end_time.timestamp() * 1000)

    self.max_tables_per_query = max_tables_per_query
    self.dedup_strategy = dedup_strategy

    # SQL predicates are rendered once here and reused by the query text.
    self.users_filter = self._build_user_filter(
        deny_usernames=deny_usernames,
        allow_usernames=allow_usernames,
    )
    self.access_history_database_filter = (
        self._build_access_history_database_filter_condition(
            database_pattern=database_pattern,
            additional_database_names=additional_database_names,
        )
    )

    # The bucket size is interpolated into the SQL text, so only the known
    # values are accepted.
    self.time_bucket_size = bucket_duration.value
    assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
|
|
833
|
+
|
|
834
|
+
def _build_user_filter(
|
|
835
|
+
self,
|
|
836
|
+
deny_usernames: Optional[List[str]] = None,
|
|
837
|
+
allow_usernames: Optional[List[str]] = None,
|
|
838
|
+
) -> str:
|
|
839
|
+
"""
|
|
840
|
+
Build user filter SQL condition based on deny and allow username patterns.
|
|
841
|
+
|
|
842
|
+
Args:
|
|
843
|
+
deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
|
|
844
|
+
allow_usernames: List of username patterns to include (SQL LIKE patterns)
|
|
845
|
+
|
|
846
|
+
Returns:
|
|
847
|
+
SQL WHERE condition string for filtering users
|
|
848
|
+
"""
|
|
849
|
+
user_filters = []
|
|
850
|
+
|
|
851
|
+
if deny_usernames:
|
|
852
|
+
deny_conditions = []
|
|
853
|
+
for pattern in deny_usernames:
|
|
854
|
+
# Escape single quotes for SQL safety
|
|
855
|
+
escaped_pattern = pattern.replace("'", "''")
|
|
856
|
+
deny_conditions.append(f"user_name NOT ILIKE '{escaped_pattern}'")
|
|
857
|
+
if deny_conditions:
|
|
858
|
+
user_filters.append(f"({' AND '.join(deny_conditions)})")
|
|
859
|
+
|
|
860
|
+
if allow_usernames:
|
|
861
|
+
allow_conditions = []
|
|
862
|
+
for pattern in allow_usernames:
|
|
863
|
+
# Escape single quotes for SQL safety
|
|
864
|
+
escaped_pattern = pattern.replace("'", "''")
|
|
865
|
+
allow_conditions.append(f"user_name ILIKE '{escaped_pattern}'")
|
|
866
|
+
if allow_conditions:
|
|
867
|
+
user_filters.append(f"({' OR '.join(allow_conditions)})")
|
|
868
|
+
|
|
869
|
+
return " AND ".join(user_filters) if user_filters else "TRUE"
|
|
870
|
+
|
|
871
|
+
def _build_access_history_database_filter_condition(
|
|
872
|
+
self,
|
|
873
|
+
database_pattern: Optional[AllowDenyPattern],
|
|
874
|
+
additional_database_names: Optional[List[str]] = None,
|
|
875
|
+
) -> str:
|
|
876
|
+
"""
|
|
877
|
+
Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
|
|
878
|
+
|
|
879
|
+
IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
|
|
880
|
+
access_history table:
|
|
881
|
+
|
|
882
|
+
- DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
|
|
883
|
+
`direct_objects_accessed` and `objects_modified` arrays
|
|
884
|
+
- DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
|
|
885
|
+
`object_modified_by_ddl` field (single object, not an array)
|
|
886
|
+
|
|
887
|
+
Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
|
|
888
|
+
would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
|
|
889
|
+
and operational metadata.
|
|
890
|
+
|
|
891
|
+
Filtering Logic:
|
|
892
|
+
A query is included if it matches:
|
|
893
|
+
- Any database name in additional_database_names (exact match), OR
|
|
894
|
+
- Any database pattern in database_pattern.allow AND NOT any pattern in database_pattern.deny
|
|
895
|
+
|
|
896
|
+
Args:
|
|
897
|
+
database_pattern: The AllowDenyPattern configuration for database filtering
|
|
898
|
+
additional_database_names: Additional database names to always include (no pattern matching)
|
|
899
|
+
|
|
900
|
+
Returns:
|
|
901
|
+
A SQL WHERE condition string, or "TRUE" if no filtering should be applied
|
|
902
|
+
"""
|
|
903
|
+
if not database_pattern and not additional_database_names:
|
|
904
|
+
return "TRUE"
|
|
905
|
+
|
|
906
|
+
# Build the database filter conditions
|
|
907
|
+
# Logic: Allow if (matches additional_database_names_allowlist) OR (matches database_pattern.allow AND NOT matches database_pattern.deny)
|
|
908
|
+
# Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
|
|
909
|
+
|
|
910
|
+
# Build additional database names condition (exact matches) - these always get included
|
|
911
|
+
additional_db_condition = None
|
|
912
|
+
if additional_database_names:
|
|
913
|
+
additional_db_conditions = []
|
|
914
|
+
for db_name in additional_database_names:
|
|
915
|
+
# Escape single quotes
|
|
916
|
+
escaped_db_name = db_name.replace("'", "''")
|
|
917
|
+
additional_db_conditions.append(
|
|
918
|
+
f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
|
|
919
|
+
)
|
|
920
|
+
if additional_db_conditions:
|
|
921
|
+
additional_db_condition = " OR ".join(additional_db_conditions)
|
|
922
|
+
|
|
923
|
+
# Build database pattern condition (allow AND NOT deny)
|
|
924
|
+
database_pattern_condition = None
|
|
925
|
+
if database_pattern:
|
|
926
|
+
allow_patterns = database_pattern.allow
|
|
927
|
+
deny_patterns = database_pattern.deny
|
|
928
|
+
|
|
929
|
+
pattern_parts = []
|
|
930
|
+
|
|
931
|
+
# Add allow patterns (if not the default "allow all")
|
|
932
|
+
if allow_patterns and allow_patterns != [".*"]:
|
|
933
|
+
allow_conditions = []
|
|
934
|
+
for pattern in allow_patterns:
|
|
935
|
+
# Escape single quotes that might be present in the regex pattern
|
|
936
|
+
escaped_pattern = pattern.replace("'", "''")
|
|
937
|
+
allow_conditions.append(
|
|
938
|
+
f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
|
|
939
|
+
)
|
|
940
|
+
if allow_conditions:
|
|
941
|
+
pattern_parts.append(
|
|
942
|
+
allow_conditions[0]
|
|
943
|
+
if len(allow_conditions) == 1
|
|
944
|
+
else f"({' OR '.join(allow_conditions)})"
|
|
945
|
+
)
|
|
655
946
|
|
|
947
|
+
# Add deny patterns
|
|
948
|
+
if deny_patterns:
|
|
949
|
+
deny_conditions = []
|
|
950
|
+
for pattern in deny_patterns:
|
|
951
|
+
# Escape single quotes that might be present in the regex pattern
|
|
952
|
+
escaped_pattern = pattern.replace("'", "''")
|
|
953
|
+
deny_conditions.append(
|
|
954
|
+
f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
|
|
955
|
+
)
|
|
956
|
+
if deny_conditions:
|
|
957
|
+
pattern_parts.append(
|
|
958
|
+
deny_conditions[0]
|
|
959
|
+
if len(deny_conditions) == 1
|
|
960
|
+
else f"({' AND '.join(deny_conditions)})"
|
|
961
|
+
)
|
|
656
962
|
|
|
657
|
-
|
|
658
|
-
|
|
963
|
+
if pattern_parts:
|
|
964
|
+
database_pattern_condition = " AND ".join(pattern_parts)
|
|
659
965
|
|
|
966
|
+
# Combine conditions: additional_database_names OR database_pattern
|
|
967
|
+
filter_conditions = []
|
|
968
|
+
if additional_db_condition:
|
|
969
|
+
filter_conditions.append(
|
|
970
|
+
f"({additional_db_condition})"
|
|
971
|
+
if len(additional_db_condition.split(" OR ")) > 1
|
|
972
|
+
else additional_db_condition
|
|
973
|
+
)
|
|
974
|
+
if database_pattern_condition:
|
|
975
|
+
filter_conditions.append(
|
|
976
|
+
f"({database_pattern_condition})"
|
|
977
|
+
if len(database_pattern_condition.split(" AND ")) > 1
|
|
978
|
+
else database_pattern_condition
|
|
979
|
+
)
|
|
660
980
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
)
|
|
667
|
-
start_time_millis = int(start_time.timestamp() * 1000)
|
|
668
|
-
end_time_millis = int(end_time.timestamp() * 1000)
|
|
981
|
+
if filter_conditions:
|
|
982
|
+
database_filter_condition = (
|
|
983
|
+
filter_conditions[0]
|
|
984
|
+
if len(filter_conditions) == 1
|
|
985
|
+
else " OR ".join(filter_conditions)
|
|
986
|
+
)
|
|
669
987
|
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
988
|
+
# Build a condition that checks if any objects in the arrays match the database pattern
|
|
989
|
+
# This implements "at least one" matching behavior: queries are allowed if they touch
|
|
990
|
+
# at least one database that matches the pattern, even if they also touch other databases
|
|
991
|
+
# Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
|
|
992
|
+
direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
|
|
993
|
+
objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
|
|
994
|
+
|
|
995
|
+
# CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
|
|
996
|
+
# DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
|
|
997
|
+
# We need to adapt the filter condition for a single object rather than an array
|
|
998
|
+
ddl_filter_condition = database_filter_condition.replace(
|
|
999
|
+
"o:objectName", "object_modified_by_ddl:objectName"
|
|
1000
|
+
)
|
|
1001
|
+
object_modified_by_ddl_condition = f"({ddl_filter_condition})"
|
|
674
1002
|
|
|
675
|
-
|
|
676
|
-
|
|
1003
|
+
return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
|
|
1004
|
+
else:
|
|
1005
|
+
return "TRUE"
|
|
1006
|
+
|
|
1007
|
+
def _query_fingerprinted_queries(self) -> str:
    """
    Build the SELECT that forms the body of the `fingerprinted_queries` CTE.

    The statement reads `snowflake.account_usage.query_history`, keeping rows
    whose `start_time` lies in [start_time, end_time), whose
    `execution_status` is 'SUCCESS' and which satisfy `self.users_filter`.
    Two columns are added: `query_fingerprint` (Snowflake's
    `query_parameterized_hash`) and `query_secondary_fingerprint`, whose
    expression depends on `self.dedup_strategy`.

    Returns:
        The SQL text of the SELECT statement.

    Raises:
        NotImplementedError: If `self.dedup_strategy` is neither STANDARD nor NONE.
    """
    if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
        # For query text carrying a '-- Hex query metadata:' marker, hash the
        # project_id and context values extracted from it; otherwise NULL.
        # This is a plain (non-f) literal, so every `\\\\` below ends up as two
        # backslashes in the generated SQL text.
        # NOTE(review): presumably Snowflake's string-literal escaping then
        # reduces that to the regex `\s` -- confirm.
        secondary_fingerprint_sql = """
CASE
WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-- Extract project id and hash it
THEN CAST(HASH(
REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
) AS VARCHAR)
ELSE NULL
END"""
    elif self.dedup_strategy == QueryDedupStrategyType.NONE:
        # No secondary fingerprint when deduplication is disabled.
        secondary_fingerprint_sql = "NULL"
    else:
        raise NotImplementedError(
            f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
        )
    # The ISO timestamps after `--` are SQL comments that make the millisecond
    # boundaries readable in the rendered query.
    return f"""
SELECT *,
-- TODO: Generate better fingerprints for each query by pushing down regex logic.
query_history.query_parameterized_hash as query_fingerprint,
-- Optional and additional hash to be used for query deduplication and final query identity
{secondary_fingerprint_sql} as query_secondary_fingerprint
FROM
snowflake.account_usage.query_history
WHERE
query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
AND execution_status = 'SUCCESS'
AND {self.users_filter}"""
|
|
1038
|
+
|
|
1039
|
+
def _query_deduplicated_queries(self) -> str:
    """
    Build the SELECT that forms the body of the `deduplicated_queries` CTE.

    Both variants read from `fingerprinted_queries` and add a
    `bucket_start_time` column: the UTC-converted `start_time` truncated to
    `self.time_bucket_size`.

    - STANDARD: keeps one row per (bucket_start_time, query_fingerprint,
      query_secondary_fingerprint) -- the one with the latest `start_time` --
      and sets `query_count` to the number of rows in that group.
    - NONE: keeps every row and sets `query_count` to 1.

    Returns:
        The SQL text of the SELECT statement.

    Raises:
        NotImplementedError: If `self.dedup_strategy` is neither STANDARD nor NONE.
    """
    if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
        # COUNT(*) OVER counts the duplicates per group; QUALIFY with
        # ROW_NUMBER() = 1 then retains only the most recent row of each group.
        return f"""
SELECT
*,
DATE_TRUNC(
{self.time_bucket_size},
CONVERT_TIMEZONE('UTC', start_time)
) AS bucket_start_time,
COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
FROM
fingerprinted_queries
QUALIFY
ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
    elif self.dedup_strategy == QueryDedupStrategyType.NONE:
        # No deduplication: every query is its own entry with a count of one.
        return f"""
SELECT
*,
DATE_TRUNC(
{self.time_bucket_size},
CONVERT_TIMEZONE('UTC', start_time)
) AS bucket_start_time,
1 AS query_count,
FROM
fingerprinted_queries"""
    else:
        raise NotImplementedError(
            f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
        )
|
|
677
1068
|
|
|
678
|
-
|
|
1069
|
+
def build_enriched_query_log_query(self) -> str:
|
|
1070
|
+
return f"""\
|
|
679
1071
|
WITH
|
|
680
1072
|
fingerprinted_queries as (
|
|
681
|
-
|
|
682
|
-
-- TODO: Generate better fingerprints for each query by pushing down regex logic.
|
|
683
|
-
query_history.query_parameterized_hash as query_fingerprint,
|
|
684
|
-
-- Optional and additional hash to be used for query deduplication and final query identity
|
|
685
|
-
CASE
|
|
686
|
-
WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
|
|
687
|
-
-- Extract project id and hash it
|
|
688
|
-
THEN CAST(HASH(
|
|
689
|
-
REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
|
|
690
|
-
REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
|
|
691
|
-
) AS VARCHAR)
|
|
692
|
-
ELSE NULL
|
|
693
|
-
END as query_secondary_fingerprint
|
|
694
|
-
FROM
|
|
695
|
-
snowflake.account_usage.query_history
|
|
696
|
-
WHERE
|
|
697
|
-
query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
|
|
698
|
-
AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
|
|
699
|
-
AND execution_status = 'SUCCESS'
|
|
700
|
-
AND {users_filter or "TRUE"}
|
|
1073
|
+
{self._query_fingerprinted_queries()}
|
|
701
1074
|
)
|
|
702
1075
|
, deduplicated_queries as (
|
|
703
|
-
|
|
704
|
-
*,
|
|
705
|
-
DATE_TRUNC(
|
|
706
|
-
{time_bucket_size},
|
|
707
|
-
CONVERT_TIMEZONE('UTC', start_time)
|
|
708
|
-
) AS bucket_start_time,
|
|
709
|
-
COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
|
|
710
|
-
FROM
|
|
711
|
-
fingerprinted_queries
|
|
712
|
-
QUALIFY
|
|
713
|
-
ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
|
|
1076
|
+
{self._query_deduplicated_queries()}
|
|
714
1077
|
)
|
|
715
1078
|
, raw_access_history AS (
|
|
716
1079
|
SELECT
|
|
717
1080
|
query_id,
|
|
1081
|
+
root_query_id,
|
|
718
1082
|
query_start_time,
|
|
719
1083
|
user_name,
|
|
720
1084
|
direct_objects_accessed,
|
|
@@ -723,21 +1087,23 @@ fingerprinted_queries as (
|
|
|
723
1087
|
FROM
|
|
724
1088
|
snowflake.account_usage.access_history
|
|
725
1089
|
WHERE
|
|
726
|
-
query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
|
|
727
|
-
AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
|
|
728
|
-
AND {users_filter
|
|
1090
|
+
query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
|
|
1091
|
+
AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
|
|
1092
|
+
AND {self.users_filter}
|
|
729
1093
|
AND query_id IN (
|
|
730
1094
|
SELECT query_id FROM deduplicated_queries
|
|
731
1095
|
)
|
|
1096
|
+
AND {self.access_history_database_filter}
|
|
732
1097
|
)
|
|
733
1098
|
, filtered_access_history AS (
|
|
734
1099
|
-- TODO: Add table filter clause.
|
|
735
1100
|
SELECT
|
|
736
1101
|
query_id,
|
|
1102
|
+
root_query_id,
|
|
737
1103
|
query_start_time,
|
|
738
1104
|
ARRAY_SLICE(
|
|
739
1105
|
FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
|
|
740
|
-
0, {
|
|
1106
|
+
0, {self.max_tables_per_query}
|
|
741
1107
|
) as direct_objects_accessed,
|
|
742
1108
|
-- TODO: Drop the columns.baseSources subfield.
|
|
743
1109
|
FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
|
|
@@ -764,6 +1130,7 @@ fingerprinted_queries as (
|
|
|
764
1130
|
q.rows_deleted AS "ROWS_DELETED",
|
|
765
1131
|
q.user_name AS "USER_NAME",
|
|
766
1132
|
q.role_name AS "ROLE_NAME",
|
|
1133
|
+
a.root_query_id,
|
|
767
1134
|
a.direct_objects_accessed,
|
|
768
1135
|
a.objects_modified,
|
|
769
1136
|
a.object_modified_by_ddl
|