acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -45,6 +45,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
|
|
|
45
45
|
SnowflakeColumn,
|
|
46
46
|
SnowflakeDatabase,
|
|
47
47
|
SnowflakeDataDictionary,
|
|
48
|
+
SnowflakeDynamicTable,
|
|
48
49
|
SnowflakeFK,
|
|
49
50
|
SnowflakePK,
|
|
50
51
|
SnowflakeSchema,
|
|
@@ -76,7 +77,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
|
|
|
76
77
|
EXTERNAL_TABLE_DDL_LINEAGE,
|
|
77
78
|
LINEAGE_EXTRACTION,
|
|
78
79
|
METADATA_EXTRACTION,
|
|
79
|
-
|
|
80
|
+
IngestionHighStage,
|
|
80
81
|
)
|
|
81
82
|
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
82
83
|
GlobalTags,
|
|
@@ -165,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
165
166
|
|
|
166
167
|
def __init__(
|
|
167
168
|
self,
|
|
168
|
-
config: SnowflakeV2Config,
|
|
169
|
-
report: SnowflakeV2Report,
|
|
169
|
+
config: SnowflakeV2Config, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
|
|
170
|
+
report: SnowflakeV2Report, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
|
|
170
171
|
connection: SnowflakeConnection,
|
|
171
172
|
filters: SnowflakeFilter,
|
|
172
173
|
identifiers: SnowflakeIdentifierBuilder,
|
|
@@ -174,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
174
175
|
profiler: Optional[SnowflakeProfiler],
|
|
175
176
|
aggregator: Optional[SqlParsingAggregator],
|
|
176
177
|
snowsight_url_builder: Optional[SnowsightUrlBuilder],
|
|
178
|
+
fetch_views_from_information_schema: bool = False,
|
|
177
179
|
) -> None:
|
|
178
180
|
self.config: SnowflakeV2Config = config
|
|
179
181
|
self.report: SnowflakeV2Report = report
|
|
@@ -182,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
182
184
|
self.identifiers: SnowflakeIdentifierBuilder = identifiers
|
|
183
185
|
|
|
184
186
|
self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
|
|
185
|
-
connection=self.connection
|
|
187
|
+
connection=self.connection,
|
|
188
|
+
report=self.report,
|
|
189
|
+
fetch_views_from_information_schema=fetch_views_from_information_schema,
|
|
186
190
|
)
|
|
187
191
|
self.report.data_dictionary_cache = self.data_dictionary
|
|
188
192
|
|
|
@@ -356,7 +360,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
356
360
|
yield from self._process_db_schemas(snowflake_db, db_tables)
|
|
357
361
|
|
|
358
362
|
if self.profiler and db_tables:
|
|
359
|
-
with self.report.
|
|
363
|
+
with self.report.new_high_stage(IngestionHighStage.PROFILING):
|
|
360
364
|
yield from self.profiler.get_workunits(snowflake_db, db_tables)
|
|
361
365
|
|
|
362
366
|
def _process_db_schemas(
|
|
@@ -437,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
437
441
|
tables = self.fetch_tables_for_schema(
|
|
438
442
|
snowflake_schema, db_name, schema_name
|
|
439
443
|
)
|
|
444
|
+
if self.config.include_views:
|
|
445
|
+
views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
|
|
446
|
+
|
|
447
|
+
if self.config.include_tables:
|
|
440
448
|
db_tables[schema_name] = tables
|
|
441
449
|
yield from self._process_tables(
|
|
442
450
|
tables, snowflake_schema, db_name, schema_name
|
|
443
451
|
)
|
|
444
452
|
|
|
445
453
|
if self.config.include_views:
|
|
446
|
-
views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
|
|
447
454
|
yield from self._process_views(
|
|
448
455
|
views, snowflake_schema, db_name, schema_name
|
|
449
456
|
)
|
|
@@ -495,6 +502,22 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
495
502
|
if self.config.include_technical_schema:
|
|
496
503
|
data_reader = self.make_data_reader()
|
|
497
504
|
for table in tables:
|
|
505
|
+
# Handle dynamic table definitions for lineage
|
|
506
|
+
if (
|
|
507
|
+
isinstance(table, SnowflakeDynamicTable)
|
|
508
|
+
and table.definition
|
|
509
|
+
and self.aggregator
|
|
510
|
+
):
|
|
511
|
+
table_identifier = self.identifiers.get_dataset_identifier(
|
|
512
|
+
table.name, schema_name, db_name
|
|
513
|
+
)
|
|
514
|
+
self.aggregator.add_view_definition(
|
|
515
|
+
view_urn=self.identifiers.gen_dataset_urn(table_identifier),
|
|
516
|
+
view_definition=table.definition,
|
|
517
|
+
default_db=db_name,
|
|
518
|
+
default_schema=schema_name,
|
|
519
|
+
)
|
|
520
|
+
|
|
498
521
|
table_wu_generator = self._process_table(
|
|
499
522
|
table, snowflake_schema, db_name
|
|
500
523
|
)
|
|
@@ -935,6 +958,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
935
958
|
}
|
|
936
959
|
)
|
|
937
960
|
|
|
961
|
+
if isinstance(table, SnowflakeDynamicTable):
|
|
962
|
+
if table.target_lag:
|
|
963
|
+
custom_properties["TARGET_LAG"] = table.target_lag
|
|
964
|
+
|
|
938
965
|
if isinstance(table, SnowflakeView) and table.is_secure:
|
|
939
966
|
custom_properties["IS_SECURE"] = "true"
|
|
940
967
|
|
|
@@ -980,7 +1007,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
980
1007
|
schema_name,
|
|
981
1008
|
db_name,
|
|
982
1009
|
(
|
|
983
|
-
SnowflakeObjectDomain.TABLE
|
|
1010
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE
|
|
1011
|
+
if isinstance(table, SnowflakeTable) and table.is_dynamic
|
|
1012
|
+
else SnowflakeObjectDomain.TABLE
|
|
984
1013
|
if isinstance(table, SnowflakeTable)
|
|
985
1014
|
else SnowflakeObjectDomain.VIEW
|
|
986
1015
|
),
|
|
@@ -1218,7 +1247,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1218
1247
|
# falling back to get tables for schema
|
|
1219
1248
|
if tables is None:
|
|
1220
1249
|
self.report.num_get_tables_for_schema_queries += 1
|
|
1221
|
-
return self.data_dictionary.get_tables_for_schema(
|
|
1250
|
+
return self.data_dictionary.get_tables_for_schema(
|
|
1251
|
+
db_name=db_name,
|
|
1252
|
+
schema_name=schema_name,
|
|
1253
|
+
)
|
|
1222
1254
|
|
|
1223
1255
|
# Some schema may not have any table
|
|
1224
1256
|
return tables.get(schema_name, [])
|
|
@@ -1228,8 +1260,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1228
1260
|
) -> List[SnowflakeView]:
|
|
1229
1261
|
views = self.data_dictionary.get_views_for_database(db_name)
|
|
1230
1262
|
|
|
1231
|
-
|
|
1232
|
-
|
|
1263
|
+
if views is not None:
|
|
1264
|
+
# Some schemas may not have any views
|
|
1265
|
+
return views.get(schema_name, [])
|
|
1266
|
+
|
|
1267
|
+
# Usually this fails when there are too many views in the schema.
|
|
1268
|
+
# Fall back to per-schema queries.
|
|
1269
|
+
self.report.num_get_views_for_schema_queries += 1
|
|
1270
|
+
return self.data_dictionary.get_views_for_schema_using_information_schema(
|
|
1271
|
+
db_name=db_name,
|
|
1272
|
+
schema_name=schema_name,
|
|
1273
|
+
)
|
|
1233
1274
|
|
|
1234
1275
|
def get_columns_for_table(
|
|
1235
1276
|
self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str
|
|
@@ -20,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import (
|
|
|
20
20
|
SnowflakeSchemaGenerator,
|
|
21
21
|
)
|
|
22
22
|
from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
23
|
+
SnowflakeFilter,
|
|
23
24
|
SnowflakeIdentifierBuilder,
|
|
24
25
|
)
|
|
25
26
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
@@ -58,7 +59,7 @@ class SnowflakeSummaryReport(SourceReport, BaseTimeWindowReport):
|
|
|
58
59
|
|
|
59
60
|
|
|
60
61
|
@config_class(SnowflakeSummaryConfig)
|
|
61
|
-
@support_status(SupportStatus.
|
|
62
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
62
63
|
class SnowflakeSummarySource(Source):
|
|
63
64
|
def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig):
|
|
64
65
|
super().__init__(ctx)
|
|
@@ -81,6 +82,11 @@ class SnowflakeSummarySource(Source):
|
|
|
81
82
|
profiler=None,
|
|
82
83
|
aggregator=None,
|
|
83
84
|
snowsight_url_builder=None,
|
|
85
|
+
filters=SnowflakeFilter(
|
|
86
|
+
filter_config=self.config,
|
|
87
|
+
structured_reporter=self.report,
|
|
88
|
+
),
|
|
89
|
+
fetch_views_from_information_schema=False, # we haven't enabled this config for SnowflakeSummarySource
|
|
84
90
|
)
|
|
85
91
|
|
|
86
92
|
# Databases.
|
|
@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
231
231
|
|
|
232
232
|
with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
|
|
233
233
|
for row in results:
|
|
234
|
-
with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
|
|
234
|
+
with (
|
|
235
|
+
fetch_timer.pause(),
|
|
236
|
+
self.report.usage_aggregation.result_skip_timer as skip_timer,
|
|
237
|
+
):
|
|
235
238
|
if results.rownumber is not None and results.rownumber % 1000 == 0:
|
|
236
239
|
logger.debug(f"Processing usage row number {results.rownumber}")
|
|
237
240
|
logger.debug(self.report.usage_aggregation.as_string())
|
|
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
255
258
|
f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
|
|
256
259
|
)
|
|
257
260
|
continue
|
|
258
|
-
with
|
|
261
|
+
with (
|
|
262
|
+
skip_timer.pause(),
|
|
263
|
+
self.report.usage_aggregation.result_map_timer as map_timer,
|
|
264
|
+
):
|
|
259
265
|
wu = self.build_usage_statistics_for_dataset(
|
|
260
266
|
dataset_identifier, row
|
|
261
267
|
)
|
|
@@ -9,6 +9,7 @@ from datahub.emitter.mce_builder import (
|
|
|
9
9
|
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
10
10
|
from datahub.ingestion.api.source import SourceReport
|
|
11
11
|
from datahub.ingestion.source.snowflake.constants import (
|
|
12
|
+
DEFAULT_SNOWFLAKE_DOMAIN,
|
|
12
13
|
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
|
|
13
14
|
SnowflakeCloudProvider,
|
|
14
15
|
SnowflakeObjectDomain,
|
|
@@ -34,16 +35,21 @@ class SnowsightUrlBuilder:
|
|
|
34
35
|
"us-east-1",
|
|
35
36
|
"eu-west-1",
|
|
36
37
|
"eu-central-1",
|
|
37
|
-
"ap-southeast-1",
|
|
38
38
|
"ap-southeast-2",
|
|
39
39
|
]
|
|
40
40
|
|
|
41
41
|
snowsight_base_url: str
|
|
42
42
|
|
|
43
|
-
def __init__(
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
account_locator: str,
|
|
46
|
+
region: str,
|
|
47
|
+
privatelink: bool = False,
|
|
48
|
+
snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
|
|
49
|
+
):
|
|
44
50
|
cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region)
|
|
45
51
|
self.snowsight_base_url = self.create_snowsight_base_url(
|
|
46
|
-
account_locator, cloud_region_id, cloud, privatelink
|
|
52
|
+
account_locator, cloud_region_id, cloud, privatelink, snowflake_domain
|
|
47
53
|
)
|
|
48
54
|
|
|
49
55
|
@staticmethod
|
|
@@ -52,6 +58,7 @@ class SnowsightUrlBuilder:
|
|
|
52
58
|
cloud_region_id: str,
|
|
53
59
|
cloud: str,
|
|
54
60
|
privatelink: bool = False,
|
|
61
|
+
snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
|
|
55
62
|
) -> str:
|
|
56
63
|
if cloud:
|
|
57
64
|
url_cloud_provider_suffix = f".{cloud}"
|
|
@@ -66,8 +73,14 @@ class SnowsightUrlBuilder:
|
|
|
66
73
|
url_cloud_provider_suffix = ""
|
|
67
74
|
else:
|
|
68
75
|
url_cloud_provider_suffix = f".{cloud}"
|
|
69
|
-
|
|
70
|
-
|
|
76
|
+
# Note: Snowsight is always accessed via the public internet (app.snowflake.com)
|
|
77
|
+
# even for accounts using privatelink. Privatelink only applies to database connections,
|
|
78
|
+
# not the Snowsight web UI.
|
|
79
|
+
# Standard Snowsight URL format - works for most regions
|
|
80
|
+
# China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
|
|
81
|
+
# guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
|
|
82
|
+
if snowflake_domain == "snowflakecomputing.cn":
|
|
83
|
+
url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
|
|
71
84
|
else:
|
|
72
85
|
url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
|
|
73
86
|
return url
|
|
@@ -93,9 +106,20 @@ class SnowsightUrlBuilder:
|
|
|
93
106
|
table_name: str,
|
|
94
107
|
schema_name: str,
|
|
95
108
|
db_name: str,
|
|
96
|
-
domain: Literal[
|
|
109
|
+
domain: Literal[
|
|
110
|
+
SnowflakeObjectDomain.TABLE,
|
|
111
|
+
SnowflakeObjectDomain.VIEW,
|
|
112
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
113
|
+
],
|
|
97
114
|
) -> Optional[str]:
|
|
98
|
-
|
|
115
|
+
# For dynamic tables, use the dynamic-table domain in the URL path
|
|
116
|
+
# Ensure only explicitly dynamic tables use dynamic-table URL path
|
|
117
|
+
url_domain = (
|
|
118
|
+
"dynamic-table"
|
|
119
|
+
if domain == SnowflakeObjectDomain.DYNAMIC_TABLE
|
|
120
|
+
else str(domain)
|
|
121
|
+
)
|
|
122
|
+
return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{url_domain}/{table_name}/"
|
|
99
123
|
|
|
100
124
|
def get_external_url_for_schema(
|
|
101
125
|
self, schema_name: str, db_name: str
|
|
@@ -129,6 +153,7 @@ class SnowflakeFilter:
|
|
|
129
153
|
SnowflakeObjectDomain.MATERIALIZED_VIEW,
|
|
130
154
|
SnowflakeObjectDomain.ICEBERG_TABLE,
|
|
131
155
|
SnowflakeObjectDomain.STREAM,
|
|
156
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
132
157
|
):
|
|
133
158
|
return False
|
|
134
159
|
if _is_sys_table(dataset_name):
|
|
@@ -160,7 +185,8 @@ class SnowflakeFilter:
|
|
|
160
185
|
return False
|
|
161
186
|
|
|
162
187
|
if dataset_type.lower() in {
|
|
163
|
-
SnowflakeObjectDomain.TABLE
|
|
188
|
+
SnowflakeObjectDomain.TABLE,
|
|
189
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
164
190
|
} and not self.filter_config.table_pattern.allowed(
|
|
165
191
|
_cleanup_qualified_name(dataset_name, self.structured_reporter)
|
|
166
192
|
):
|
|
@@ -325,15 +351,10 @@ class SnowflakeIdentifierBuilder:
|
|
|
325
351
|
user_email: Optional[str],
|
|
326
352
|
) -> str:
|
|
327
353
|
if user_email:
|
|
328
|
-
return self.snowflake_identifier(
|
|
329
|
-
user_email
|
|
330
|
-
if self.identifier_config.email_as_user_identifier is True
|
|
331
|
-
else user_email.split("@")[0]
|
|
332
|
-
)
|
|
354
|
+
return self.snowflake_identifier(user_email)
|
|
333
355
|
return self.snowflake_identifier(
|
|
334
356
|
f"{user_name}@{self.identifier_config.email_domain}"
|
|
335
|
-
if self.identifier_config.
|
|
336
|
-
and self.identifier_config.email_domain is not None
|
|
357
|
+
if self.identifier_config.email_domain is not None
|
|
337
358
|
else user_name
|
|
338
359
|
)
|
|
339
360
|
|
|
@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
|
|
|
32
32
|
)
|
|
33
33
|
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
34
34
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
35
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
35
36
|
from datahub.ingestion.source.snowflake.constants import (
|
|
36
37
|
GENERIC_PERMISSION_ERROR_KEY,
|
|
37
38
|
SnowflakeEdition,
|
|
@@ -72,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
72
73
|
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
|
|
73
74
|
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
74
75
|
RedundantLineageRunSkipHandler,
|
|
76
|
+
RedundantQueriesRunSkipHandler,
|
|
75
77
|
RedundantUsageRunSkipHandler,
|
|
76
78
|
)
|
|
77
79
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
@@ -97,7 +99,14 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
97
99
|
@support_status(SupportStatus.CERTIFIED)
|
|
98
100
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
99
101
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
100
|
-
@capability(
|
|
102
|
+
@capability(
|
|
103
|
+
SourceCapability.CONTAINERS,
|
|
104
|
+
"Enabled by default",
|
|
105
|
+
subtype_modifier=[
|
|
106
|
+
SourceCapabilityModifier.DATABASE,
|
|
107
|
+
SourceCapabilityModifier.SCHEMA,
|
|
108
|
+
],
|
|
109
|
+
)
|
|
101
110
|
@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
|
|
102
111
|
@capability(
|
|
103
112
|
SourceCapability.DATA_PROFILING,
|
|
@@ -118,7 +127,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
118
127
|
)
|
|
119
128
|
@capability(
|
|
120
129
|
SourceCapability.DELETION_DETECTION,
|
|
121
|
-
"
|
|
130
|
+
"Enabled by default via stateful ingestion",
|
|
122
131
|
supported=True,
|
|
123
132
|
)
|
|
124
133
|
@capability(
|
|
@@ -131,6 +140,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
131
140
|
"Optionally enabled via `classification.enabled`",
|
|
132
141
|
supported=True,
|
|
133
142
|
)
|
|
143
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
134
144
|
class SnowflakeV2Source(
|
|
135
145
|
SnowflakeCommonMixin,
|
|
136
146
|
StatefulIngestionSourceBase,
|
|
@@ -162,7 +172,11 @@ class SnowflakeV2Source(
|
|
|
162
172
|
)
|
|
163
173
|
|
|
164
174
|
# For database, schema, tables, views, etc
|
|
165
|
-
self.data_dictionary = SnowflakeDataDictionary(
|
|
175
|
+
self.data_dictionary = SnowflakeDataDictionary(
|
|
176
|
+
connection=self.connection,
|
|
177
|
+
report=self.report,
|
|
178
|
+
fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
|
|
179
|
+
)
|
|
166
180
|
self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
|
|
167
181
|
|
|
168
182
|
self.discovered_datasets: Optional[List[str]] = None
|
|
@@ -186,6 +200,7 @@ class SnowflakeV2Source(
|
|
|
186
200
|
),
|
|
187
201
|
generate_usage_statistics=False,
|
|
188
202
|
generate_operations=False,
|
|
203
|
+
generate_queries=self.config.include_queries,
|
|
189
204
|
format_queries=self.config.format_sql_queries,
|
|
190
205
|
is_temp_table=self._is_temp_table,
|
|
191
206
|
is_allowed_table=self._is_allowed_table,
|
|
@@ -193,7 +208,7 @@ class SnowflakeV2Source(
|
|
|
193
208
|
)
|
|
194
209
|
self.report.sql_aggregator = self.aggregator.report
|
|
195
210
|
|
|
196
|
-
if self.config.include_table_lineage:
|
|
211
|
+
if self.config.include_table_lineage and not self.config.use_queries_v2:
|
|
197
212
|
redundant_lineage_run_skip_handler: Optional[
|
|
198
213
|
RedundantLineageRunSkipHandler
|
|
199
214
|
] = None
|
|
@@ -311,6 +326,7 @@ class SnowflakeV2Source(
|
|
|
311
326
|
SourceCapability.PLATFORM_INSTANCE,
|
|
312
327
|
SourceCapability.DOMAINS,
|
|
313
328
|
SourceCapability.DELETION_DETECTION,
|
|
329
|
+
SourceCapability.TEST_CONNECTION,
|
|
314
330
|
)
|
|
315
331
|
]
|
|
316
332
|
|
|
@@ -516,6 +532,7 @@ class SnowflakeV2Source(
|
|
|
516
532
|
snowsight_url_builder=snowsight_url_builder,
|
|
517
533
|
filters=self.filters,
|
|
518
534
|
identifiers=self.identifiers,
|
|
535
|
+
fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
|
|
519
536
|
)
|
|
520
537
|
|
|
521
538
|
with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
|
|
@@ -573,8 +590,20 @@ class SnowflakeV2Source(
|
|
|
573
590
|
with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
|
|
574
591
|
schema_resolver = self.aggregator._schema_resolver
|
|
575
592
|
|
|
593
|
+
redundant_queries_run_skip_handler: Optional[
|
|
594
|
+
RedundantQueriesRunSkipHandler
|
|
595
|
+
] = None
|
|
596
|
+
if self.config.enable_stateful_time_window:
|
|
597
|
+
redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
|
|
598
|
+
source=self,
|
|
599
|
+
config=self.config,
|
|
600
|
+
pipeline_name=self.ctx.pipeline_name,
|
|
601
|
+
run_id=self.ctx.run_id,
|
|
602
|
+
)
|
|
603
|
+
|
|
576
604
|
queries_extractor = SnowflakeQueriesExtractor(
|
|
577
605
|
connection=self.connection,
|
|
606
|
+
# TODO: this should be its own section in main recipe
|
|
578
607
|
config=SnowflakeQueriesExtractorConfig(
|
|
579
608
|
window=BaseTimeWindowConfig(
|
|
580
609
|
start_time=self.config.start_time,
|
|
@@ -589,10 +618,15 @@ class SnowflakeV2Source(
|
|
|
589
618
|
include_query_usage_statistics=self.config.include_query_usage_statistics,
|
|
590
619
|
user_email_pattern=self.config.user_email_pattern,
|
|
591
620
|
pushdown_deny_usernames=self.config.pushdown_deny_usernames,
|
|
621
|
+
pushdown_allow_usernames=self.config.pushdown_allow_usernames,
|
|
622
|
+
query_dedup_strategy=self.config.query_dedup_strategy,
|
|
623
|
+
push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
|
|
624
|
+
additional_database_names_allowlist=self.config.additional_database_names_allowlist,
|
|
592
625
|
),
|
|
593
626
|
structured_report=self.report,
|
|
594
627
|
filters=self.filters,
|
|
595
628
|
identifiers=self.identifiers,
|
|
629
|
+
redundant_run_skip_handler=redundant_queries_run_skip_handler,
|
|
596
630
|
schema_resolver=schema_resolver,
|
|
597
631
|
discovered_tables=self.discovered_datasets,
|
|
598
632
|
graph=self.ctx.graph,
|
|
@@ -730,6 +764,7 @@ class SnowflakeV2Source(
|
|
|
730
764
|
# For privatelink, account identifier ends with .privatelink
|
|
731
765
|
# See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls
|
|
732
766
|
privatelink=self.config.account_id.endswith(".privatelink"),
|
|
767
|
+
snowflake_domain=self.config.snowflake_domain,
|
|
733
768
|
)
|
|
734
769
|
|
|
735
770
|
except Exception as e:
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Iterable, List, Optional
|
|
5
|
+
|
|
6
|
+
from datahub.ingestion.api.closeable import Closeable
|
|
7
|
+
from datahub.metadata.urns import CorpUserUrn
|
|
8
|
+
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
9
|
+
PreparsedQuery,
|
|
10
|
+
UrnStr,
|
|
11
|
+
)
|
|
12
|
+
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
|
|
13
|
+
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
class StoredProcCall:
    """One observed invocation of a Snowflake stored procedure.

    Instances are registered with ``StoredProcLineageTracker`` and keyed by
    ``snowflake_root_query_id`` so that the queries executed inside the
    procedure can later be attached to the call that triggered them.
    """

    # Query id of the CALL statement itself; queries run inside the procedure
    # report this value as their root query id.
    snowflake_root_query_id: str

    # Text of the CALL statement. Typically something like:
    #   "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
    query_text: str

    # When the call was issued and by whom.
    timestamp: datetime
    user: CorpUserUrn

    # Default database / schema associated with the call.
    default_db: str
    default_schema: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class StoredProcExecutionLineage:
    """Table-level lineage accumulated for a single stored procedure call.

    ``inputs`` and ``outputs`` start empty and are filled in as queries that
    belong to ``call`` are reported to the tracker.
    """

    # The stored procedure invocation this lineage belongs to.
    call: StoredProcCall

    # Dataset urns read by queries executed within the procedure.
    inputs: List[UrnStr]

    # Dataset urns written by queries executed within the procedure.
    outputs: List[UrnStr]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class StoredProcLineageReport:
    """Counters describing what the stored procedure lineage tracker has seen."""

    # --- Incremented while calls and queries are being collected. ---

    # Stored procedure calls registered with the tracker.
    num_stored_proc_calls: int = 0
    # Queries that were attached to a registered stored procedure call.
    num_related_queries: int = 0
    # Queries carrying a root query id for which no call was registered.
    num_related_queries_without_proc_call: int = 0

    # --- Incremented at generation/build time. ---

    # Lineage entries produced from the collected calls.
    num_stored_proc_lineage_entries: int = 0
    # Calls for which no input tables were collected.
    num_stored_proc_calls_with_no_inputs: int = 0
    # Calls that had inputs but for which no output tables were collected.
    num_stored_proc_calls_with_no_outputs: int = 0
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class StoredProcLineageTracker(Closeable):
    """
    Tracks table-level lineage for Snowflake stored procedures.

    Stored procedures in Snowflake trigger multiple SQL queries during execution.
    Snowflake assigns each stored procedure call a unique query_id and uses this as the
    root_query_id for all subsequent queries executed within that procedure. This allows
    us to trace which queries belong to a specific stored procedure execution and build
    table-level lineage by aggregating inputs/outputs from all related queries.

    Usage: register calls with `add_stored_proc_call`, feed candidate queries through
    `add_related_query`, then iterate `build_merged_lineage_entries`. Call `close`
    when done to release the backing store.
    """

    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
        """
        Args:
            platform: Platform name passed to the query fingerprinting helper.
            shared_connection: Forwarded as-is to the FileBackedDict that stores
                the per-call lineage (table name "stored_proc_lineage").
        """
        self.platform = platform
        self.report = StoredProcLineageReport()

        # { root_query_id -> StoredProcExecutionLineage }
        self._stored_proc_execution_lineage: FileBackedDict[
            StoredProcExecutionLineage
        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")

    def add_stored_proc_call(self, call: StoredProcCall) -> None:
        """Add a stored procedure call to track.

        Registering the same root query id again replaces the previous entry,
        including any inputs/outputs collected for it so far.
        """
        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
            StoredProcExecutionLineage(
                call=call,
                # Will be populated by subsequent queries.
                inputs=[],
                outputs=[],
            )
        )
        self.report.num_stored_proc_calls += 1

    def add_related_query(self, query: PreparsedQuery) -> bool:
        """Add a query that might be related to a stored procedure execution.

        The query is matched through the "snowflake_root_query_id" key of its
        `extra_info`. Its upstreams (and downstream, if any) are appended to
        the matching stored procedure execution.

        Returns True if the query was added to a stored procedure execution, False otherwise.
        """
        snowflake_root_query_id = (query.extra_info or {}).get(
            "snowflake_root_query_id"
        )
        if not snowflake_root_query_id:
            # Not executed from within a stored procedure.
            return False

        if snowflake_root_query_id not in self._stored_proc_execution_lineage:
            # The query claims a root query, but we never saw the matching call.
            self.report.num_related_queries_without_proc_call += 1
            return False

        # NOTE(review): fetched via for_mutation because the lists are modified
        # in place; presumably this is what makes the change persist - confirm
        # against FileBackedDict.
        stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
            snowflake_root_query_id
        )
        stored_proc_execution.inputs.extend(query.upstreams)
        if query.downstream is not None:
            stored_proc_execution.outputs.append(query.downstream)
        self.report.num_related_queries += 1
        return True

    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
        """Yield one lineage entry per distinct output table of each tracked call.

        Calls without any collected inputs, or without any collected outputs,
        produce no entries and are only counted in the report. Repeated
        inputs/outputs (e.g. a procedure writing the same table in several
        statements) are collapsed, keeping first-seen order, so that each
        (call, output table) pair is emitted exactly once.
        """
        # For stored procedures, we can only get table-level lineage from the audit log.
        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
        # create dataJobInputOutput lineage instead.

        for stored_proc_execution in self._stored_proc_execution_lineage.values():
            if not stored_proc_execution.inputs:
                self.report.num_stored_proc_calls_with_no_inputs += 1
                continue

            if not stored_proc_execution.outputs:
                # Inputs but no outputs: there is no downstream to attach
                # lineage to, so nothing can be emitted for this call.
                self.report.num_stored_proc_calls_with_no_outputs += 1
                continue

            # dict.fromkeys de-duplicates while preserving insertion order.
            upstreams = list(dict.fromkeys(stored_proc_execution.inputs))
            call = stored_proc_execution.call

            for downstream in dict.fromkeys(stored_proc_execution.outputs):
                # The downstream urn is mixed into the fingerprint so that each
                # output table of the same CALL statement gets its own query id.
                stored_proc_query_id = get_query_fingerprint(
                    call.query_text,
                    self.platform,
                    fast=True,
                    secondary_id=downstream,
                )

                lineage_entry = PreparsedQuery(
                    query_id=stored_proc_query_id,
                    query_text=call.query_text,
                    upstreams=upstreams,
                    downstream=downstream,
                    query_count=0,
                    user=call.user,
                    timestamp=call.timestamp,
                )

                self.report.num_stored_proc_lineage_entries += 1
                yield lineage_entry

    def close(self) -> None:
        """Close the file-backed store holding the collected lineage."""
        self._stored_proc_execution_lineage.close()
|