acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -8,7 +8,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
|
8
8
|
)
|
|
9
9
|
from datahub.utilities.prefix_batch_builder import PrefixGroup
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
SHOW_COMMAND_MAX_PAGE_SIZE = 10000
|
|
12
12
|
SHOW_STREAM_MAX_PAGE_SIZE = 10000
|
|
13
13
|
|
|
14
14
|
|
|
@@ -38,12 +38,23 @@ class SnowflakeQuery:
|
|
|
38
38
|
SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
|
|
39
39
|
SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
|
|
40
40
|
SnowflakeObjectDomain.STREAM.capitalize(),
|
|
41
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
|
|
41
42
|
}
|
|
42
43
|
|
|
43
44
|
ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
|
|
44
45
|
",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
|
|
45
46
|
)
|
|
46
47
|
|
|
48
|
+
# Domains that can be downstream tables in lineage
|
|
49
|
+
DOWNSTREAM_TABLE_DOMAINS = {
|
|
50
|
+
SnowflakeObjectDomain.TABLE.capitalize(),
|
|
51
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
DOWNSTREAM_TABLE_DOMAINS_FILTER = "({})".format(
|
|
55
|
+
",".join(f"'{domain}'" for domain in DOWNSTREAM_TABLE_DOMAINS)
|
|
56
|
+
)
|
|
57
|
+
|
|
47
58
|
@staticmethod
|
|
48
59
|
def current_account() -> str:
|
|
49
60
|
return "select CURRENT_ACCOUNT()"
|
|
@@ -235,7 +246,7 @@ class SnowflakeQuery:
|
|
|
235
246
|
@staticmethod
|
|
236
247
|
def show_views_for_database(
|
|
237
248
|
db_name: str,
|
|
238
|
-
limit: int =
|
|
249
|
+
limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
|
|
239
250
|
view_pagination_marker: Optional[str] = None,
|
|
240
251
|
) -> str:
|
|
241
252
|
# While there is an information_schema.views view, that only shows the view definition if the role
|
|
@@ -244,7 +255,7 @@ class SnowflakeQuery:
|
|
|
244
255
|
|
|
245
256
|
# SHOW VIEWS can return a maximum of 10000 rows.
|
|
246
257
|
# https://docs.snowflake.com/en/sql-reference/sql/show-views#usage-notes
|
|
247
|
-
assert limit <=
|
|
258
|
+
assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
|
|
248
259
|
|
|
249
260
|
# To work around this, we paginate through the results using the FROM clause.
|
|
250
261
|
from_clause = (
|
|
@@ -255,6 +266,33 @@ SHOW VIEWS IN DATABASE "{db_name}"
|
|
|
255
266
|
LIMIT {limit} {from_clause};
|
|
256
267
|
"""
|
|
257
268
|
|
|
269
|
+
@staticmethod
|
|
270
|
+
def get_views_for_database(db_name: str) -> str:
|
|
271
|
+
# We've seen some issues with the `SHOW VIEWS` query,
|
|
272
|
+
# particularly when it requires pagination.
|
|
273
|
+
# This is an experimental alternative query that might be more reliable.
|
|
274
|
+
return f"""\
|
|
275
|
+
SELECT
|
|
276
|
+
TABLE_CATALOG as "VIEW_CATALOG",
|
|
277
|
+
TABLE_SCHEMA as "VIEW_SCHEMA",
|
|
278
|
+
TABLE_NAME as "VIEW_NAME",
|
|
279
|
+
COMMENT,
|
|
280
|
+
VIEW_DEFINITION,
|
|
281
|
+
CREATED,
|
|
282
|
+
LAST_ALTERED,
|
|
283
|
+
IS_SECURE
|
|
284
|
+
FROM "{db_name}".information_schema.views
|
|
285
|
+
WHERE TABLE_CATALOG = '{db_name}'
|
|
286
|
+
AND TABLE_SCHEMA != 'INFORMATION_SCHEMA'
|
|
287
|
+
"""
|
|
288
|
+
|
|
289
|
+
@staticmethod
|
|
290
|
+
def get_views_for_schema(db_name: str, schema_name: str) -> str:
|
|
291
|
+
return f"""\
|
|
292
|
+
{SnowflakeQuery.get_views_for_database(db_name).rstrip()}
|
|
293
|
+
AND TABLE_SCHEMA = '{schema_name}'
|
|
294
|
+
"""
|
|
295
|
+
|
|
258
296
|
@staticmethod
|
|
259
297
|
def get_secure_view_definitions() -> str:
|
|
260
298
|
# https://docs.snowflake.com/en/sql-reference/account-usage/views
|
|
@@ -686,7 +724,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
|
|
|
686
724
|
AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
|
|
687
725
|
AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
|
|
688
726
|
AND upstream_table_domain in {allowed_upstream_table_domains}
|
|
689
|
-
AND downstream_table_domain
|
|
727
|
+
AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
|
|
690
728
|
{("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
|
|
691
729
|
),
|
|
692
730
|
column_upstream_jobs AS (
|
|
@@ -843,7 +881,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
|
|
|
843
881
|
AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
|
|
844
882
|
AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
|
|
845
883
|
AND upstream_table_domain in {allowed_upstream_table_domains}
|
|
846
|
-
AND downstream_table_domain
|
|
884
|
+
AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
|
|
847
885
|
{("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
|
|
848
886
|
),
|
|
849
887
|
table_upstream_jobs_unique AS (
|
|
@@ -940,3 +978,37 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
|
|
|
940
978
|
f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
|
|
941
979
|
)
|
|
942
980
|
return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
|
|
981
|
+
|
|
982
|
+
@staticmethod
|
|
983
|
+
def show_dynamic_tables_for_database(
|
|
984
|
+
db_name: str,
|
|
985
|
+
limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
|
|
986
|
+
dynamic_table_pagination_marker: Optional[str] = None,
|
|
987
|
+
) -> str:
|
|
988
|
+
"""Get dynamic table definitions using SHOW DYNAMIC TABLES."""
|
|
989
|
+
assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
|
|
990
|
+
|
|
991
|
+
from_clause = (
|
|
992
|
+
f"""FROM '{dynamic_table_pagination_marker}'"""
|
|
993
|
+
if dynamic_table_pagination_marker
|
|
994
|
+
else ""
|
|
995
|
+
)
|
|
996
|
+
return f"""\
|
|
997
|
+
SHOW DYNAMIC TABLES IN DATABASE "{db_name}"
|
|
998
|
+
LIMIT {limit} {from_clause};
|
|
999
|
+
"""
|
|
1000
|
+
|
|
1001
|
+
@staticmethod
|
|
1002
|
+
def get_dynamic_table_graph_history(db_name: str) -> str:
|
|
1003
|
+
"""Get dynamic table dependency information from information schema."""
|
|
1004
|
+
return f"""
|
|
1005
|
+
SELECT
|
|
1006
|
+
name,
|
|
1007
|
+
inputs,
|
|
1008
|
+
target_lag_type,
|
|
1009
|
+
target_lag_sec,
|
|
1010
|
+
scheduling_state,
|
|
1011
|
+
alter_trigger
|
|
1012
|
+
FROM TABLE("{db_name}".INFORMATION_SCHEMA.DYNAMIC_TABLE_GRAPH_HISTORY())
|
|
1013
|
+
ORDER BY name
|
|
1014
|
+
"""
|
|
@@ -9,7 +9,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
|
9
9
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
10
10
|
StatefulIngestionReport,
|
|
11
11
|
)
|
|
12
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
13
12
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
14
13
|
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
15
14
|
from datahub.utilities.lossy_collections import LossyDict
|
|
@@ -96,7 +95,6 @@ class SnowflakeV2Report(
|
|
|
96
95
|
SnowflakeUsageReport,
|
|
97
96
|
StatefulIngestionReport,
|
|
98
97
|
ClassificationReportMixin,
|
|
99
|
-
IngestionStageReport,
|
|
100
98
|
):
|
|
101
99
|
account_locator: Optional[str] = None
|
|
102
100
|
region: Optional[str] = None
|
|
@@ -128,6 +126,7 @@ class SnowflakeV2Report(
|
|
|
128
126
|
# "Information schema query returned too much data. Please repeat query with more selective predicates.""
|
|
129
127
|
# This will result in overall increase in time complexity
|
|
130
128
|
num_get_tables_for_schema_queries: int = 0
|
|
129
|
+
num_get_views_for_schema_queries: int = 0
|
|
131
130
|
|
|
132
131
|
# these will be non-zero if the user choses to enable the extract_tags = "with_lineage" option, which requires
|
|
133
132
|
# individual queries per object (database, schema, table) and an extra query per table to get the tags on the columns.
|
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import os
|
|
3
2
|
from collections import defaultdict
|
|
4
3
|
from dataclasses import dataclass, field
|
|
5
4
|
from datetime import datetime
|
|
6
|
-
from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
|
|
5
|
+
from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
|
|
7
6
|
|
|
7
|
+
from datahub.configuration.env_vars import get_snowflake_schema_parallelism
|
|
8
8
|
from datahub.ingestion.api.report import SupportsAsObj
|
|
9
9
|
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
10
10
|
from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
|
|
11
11
|
from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
|
|
12
12
|
from datahub.ingestion.source.snowflake.snowflake_query import (
|
|
13
|
-
|
|
13
|
+
SHOW_COMMAND_MAX_PAGE_SIZE,
|
|
14
14
|
SnowflakeQuery,
|
|
15
15
|
)
|
|
16
|
+
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
16
17
|
from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
|
|
17
18
|
from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
|
|
18
19
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
@@ -21,7 +22,7 @@ from datahub.utilities.serialized_lru_cache import serialized_lru_cache
|
|
|
21
22
|
|
|
22
23
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
23
24
|
|
|
24
|
-
SCHEMA_PARALLELISM =
|
|
25
|
+
SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()
|
|
25
26
|
|
|
26
27
|
|
|
27
28
|
@dataclass
|
|
@@ -103,6 +104,17 @@ class SnowflakeTable(BaseTable):
|
|
|
103
104
|
return DatasetSubTypes.TABLE
|
|
104
105
|
|
|
105
106
|
|
|
107
|
+
@dataclass
class SnowflakeDynamicTable(SnowflakeTable):
    """A SnowflakeTable variant that additionally carries dynamic-table metadata."""

    # SQL query that defines the dynamic table's content
    definition: Optional[str] = None
    # Refresh frequency (e.g., "1 HOUR", "30 MINUTES")
    target_lag: Optional[str] = None

    def get_subtype(self) -> DatasetSubTypes:
        """Return the dataset subtype used for dynamic tables."""
        return DatasetSubTypes.DYNAMIC_TABLE
|
|
116
|
+
|
|
117
|
+
|
|
106
118
|
@dataclass
|
|
107
119
|
class SnowflakeView(BaseView):
|
|
108
120
|
materialized: bool = False
|
|
@@ -226,10 +238,17 @@ class _SnowflakeTagCache:
|
|
|
226
238
|
|
|
227
239
|
|
|
228
240
|
class SnowflakeDataDictionary(SupportsAsObj):
|
|
229
|
-
def __init__(
|
|
241
|
+
def __init__(
    self,
    connection: SnowflakeConnection,
    report: SnowflakeV2Report,
    fetch_views_from_information_schema: bool = False,
) -> None:
    """Keep references to the collaborators used by the data dictionary.

    Args:
        connection: Connection object used to run queries.
        report: Report object that receives warnings raised while fetching.
        fetch_views_from_information_schema: When true, database-level view
            listing goes through the information-schema code path instead of
            the SHOW-based one.
    """
    self._fetch_views_from_information_schema = fetch_views_from_information_schema
    self.report = report
    self.connection = connection
|
|
231
250
|
|
|
232
|
-
def as_obj(self) -> Dict[str,
|
|
251
|
+
def as_obj(self) -> Dict[str, Any]:
|
|
233
252
|
# TODO: Move this into a proper report type that gets computed.
|
|
234
253
|
|
|
235
254
|
# Reports how many times we reset in-memory `functools.lru_cache` caches of data,
|
|
@@ -245,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
245
264
|
self.get_fk_constraints_for_schema,
|
|
246
265
|
]
|
|
247
266
|
|
|
248
|
-
report = {
|
|
267
|
+
report: Dict[str, Any] = {
|
|
268
|
+
"fetch_views_from_information_schema": self._fetch_views_from_information_schema,
|
|
269
|
+
}
|
|
249
270
|
for func in lru_cache_functions:
|
|
250
271
|
report[func.__name__] = func.cache_info()._asdict() # type: ignore
|
|
251
272
|
return report
|
|
@@ -355,8 +376,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
355
376
|
if table["TABLE_SCHEMA"] not in tables:
|
|
356
377
|
tables[table["TABLE_SCHEMA"]] = []
|
|
357
378
|
|
|
379
|
+
is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
|
|
380
|
+
table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
|
|
381
|
+
|
|
358
382
|
tables[table["TABLE_SCHEMA"]].append(
|
|
359
|
-
|
|
383
|
+
table_cls(
|
|
360
384
|
name=table["TABLE_NAME"],
|
|
361
385
|
type=table["TABLE_TYPE"],
|
|
362
386
|
created=table["CREATED"],
|
|
@@ -365,11 +389,15 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
365
389
|
rows_count=table["ROW_COUNT"],
|
|
366
390
|
comment=table["COMMENT"],
|
|
367
391
|
clustering_key=table["CLUSTERING_KEY"],
|
|
368
|
-
is_dynamic=
|
|
392
|
+
is_dynamic=is_dynamic,
|
|
369
393
|
is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
|
|
370
394
|
is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
|
|
371
395
|
)
|
|
372
396
|
)
|
|
397
|
+
|
|
398
|
+
# Populate dynamic table definitions
|
|
399
|
+
self.populate_dynamic_table_definitions(tables, db_name)
|
|
400
|
+
|
|
373
401
|
return tables
|
|
374
402
|
|
|
375
403
|
def get_tables_for_schema(
|
|
@@ -382,8 +410,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
382
410
|
)
|
|
383
411
|
|
|
384
412
|
for table in cur:
|
|
413
|
+
is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
|
|
414
|
+
table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
|
|
415
|
+
|
|
385
416
|
tables.append(
|
|
386
|
-
|
|
417
|
+
table_cls(
|
|
387
418
|
name=table["TABLE_NAME"],
|
|
388
419
|
type=table["TABLE_TYPE"],
|
|
389
420
|
created=table["CREATED"],
|
|
@@ -392,16 +423,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
392
423
|
rows_count=table["ROW_COUNT"],
|
|
393
424
|
comment=table["COMMENT"],
|
|
394
425
|
clustering_key=table["CLUSTERING_KEY"],
|
|
395
|
-
is_dynamic=
|
|
426
|
+
is_dynamic=is_dynamic,
|
|
396
427
|
is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
|
|
397
428
|
is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
|
|
398
429
|
)
|
|
399
430
|
)
|
|
431
|
+
|
|
432
|
+
# Populate dynamic table definitions for just this schema
|
|
433
|
+
schema_tables = {schema_name: tables}
|
|
434
|
+
self.populate_dynamic_table_definitions(schema_tables, db_name)
|
|
435
|
+
|
|
400
436
|
return tables
|
|
401
437
|
|
|
402
438
|
@serialized_lru_cache(maxsize=1)
|
|
403
|
-
def get_views_for_database(
|
|
404
|
-
|
|
439
|
+
def get_views_for_database(
    self, db_name: str
) -> Optional[Dict[str, List[SnowflakeView]]]:
    """Return the views of ``db_name`` grouped by schema name.

    Delegates to the SHOW-based fetcher unless the instance was configured to
    read views from the information schema. The information-schema fetcher
    may return ``None``, hence the Optional return type.
    """
    if not self._fetch_views_from_information_schema:
        return self._get_views_for_database_using_show(db_name)
    return self._get_views_for_database_using_information_schema(db_name)
|
|
446
|
+
|
|
447
|
+
def _get_views_for_database_using_show(
|
|
448
|
+
self, db_name: str
|
|
449
|
+
) -> Dict[str, List[SnowflakeView]]:
|
|
450
|
+
page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
|
|
405
451
|
|
|
406
452
|
views: Dict[str, List[SnowflakeView]] = {}
|
|
407
453
|
|
|
@@ -431,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
431
477
|
SnowflakeView(
|
|
432
478
|
name=view_name,
|
|
433
479
|
created=view["created_on"],
|
|
434
|
-
# last_altered=table["last_altered"],
|
|
435
480
|
comment=view["comment"],
|
|
436
481
|
view_definition=view["text"],
|
|
437
|
-
last_altered=view["created_on"],
|
|
482
|
+
last_altered=view["created_on"], # TODO: This is not correct.
|
|
438
483
|
materialized=(
|
|
439
484
|
view.get("is_materialized", "false").lower() == "true"
|
|
440
485
|
),
|
|
@@ -449,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
449
494
|
)
|
|
450
495
|
view_pagination_marker = view_name
|
|
451
496
|
|
|
497
|
+
# Because this is in a cached function, this will only log once per database.
|
|
498
|
+
view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
|
|
499
|
+
logger.info(
|
|
500
|
+
f"Finished fetching views in {db_name}; counts by schema {view_counts}"
|
|
501
|
+
)
|
|
502
|
+
return views
|
|
503
|
+
|
|
504
|
+
def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
    """Turn one view row into a ``(schema name, SnowflakeView)`` pair.

    Args:
        db_name: Database the row belongs to; only used in the log message.
        row: Mapping with the upper-case keys read below (VIEW_SCHEMA,
            VIEW_NAME, CREATED, COMMENT, LAST_ALTERED and, optionally,
            VIEW_DEFINITION and IS_SECURE).

    Returns:
        The row's schema name together with the constructed view object.
    """
    schema_name = row["VIEW_SCHEMA"]
    view_definition = row.get("VIEW_DEFINITION")

    # Only a short, stripped prefix of the definition goes into the log line.
    if view_definition:
        fragment_view_definition = view_definition[:50].strip()
    else:
        fragment_view_definition = None

    logger.info(
        f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
    )

    mapped_view = SnowflakeView(
        name=row["VIEW_NAME"],
        created=row["CREATED"],
        comment=row["COMMENT"],
        view_definition=view_definition,
        last_altered=row["LAST_ALTERED"],
        is_secure=(row.get("IS_SECURE", "false").lower() == "true"),
        # TODO: This doesn't work for materialized views.
        materialized=False,
    )
    return schema_name, mapped_view
|
|
524
|
+
|
|
525
|
+
def _maybe_populate_empty_view_definitions(
    self,
    db_name: str,
    schema_name: str,
    views_with_empty_definition: List[SnowflakeView],
) -> List[SnowflakeView]:
    """Try to fill in missing view definitions with batched SHOW VIEWS queries.

    The view objects are updated in place. The list that was passed in is
    returned (or a new empty list when the input is empty). If any batch
    query fails, the error is logged and the list is returned immediately,
    so views covered by later batches keep their empty definition.

    Args:
        db_name: Database containing the views.
        schema_name: Schema containing the views.
        views_with_empty_definition: Views whose definition is missing.
    """
    if not views_with_empty_definition:
        return []

    batches = []
    for batch in build_prefix_batches(
        [view.name for view in views_with_empty_definition],
        max_batch_size=1000,
        max_groups_in_batch=1,
    ):
        # Skip empty batches; with max_groups_in_batch=1 it is safe to take batch[0].
        if batch:
            batches.append(batch[0])

    view_map: Dict[str, SnowflakeView] = {}
    for view in views_with_empty_definition:
        view_map[view.name] = view
    views_found_count = 0

    logger.info(
        f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
        f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
    )

    for batch_index, prefix_group in enumerate(batches):
        query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
        logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")

        try:
            # SHOW VIEWS doesn't guarantee order, so every returned row is
            # inspected rather than stopping once all targets are found.
            for row in self.connection.query(query):
                view_name = row["name"]
                target_view = view_map.get(view_name)
                if target_view is None:
                    # The prefix pattern can match views we are not looking for.
                    continue

                view_definition = row.get("text")
                if not view_definition:
                    logger.warning(
                        f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
                    )
                    continue

                target_view.view_definition = view_definition
                views_found_count += 1
                logger.debug(
                    f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
                )
        except Exception as e:
            logger.error(
                f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
                exc_info=e,
            )
            # Returning the original list; some views might still be missing definitions.
            # This also means subsequent batches for this schema (in this call) are skipped.
            return views_with_empty_definition

    logger.info(
        f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
        f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
    )

    if views_found_count < len(view_map):
        missing_count = len(view_map) - views_found_count
        logger.warning(
            f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
        )

    # The view objects were modified in place through view_map.
    return views_with_empty_definition
|
|
598
|
+
|
|
599
|
+
def _get_views_for_database_using_information_schema(
    self, db_name: str
) -> Optional[Dict[str, List[SnowflakeView]]]:
    """Fetch every view of ``db_name`` with a single database-wide query.

    Returns:
        A mapping of schema name to views, or ``None`` when the query itself
        fails (for example because it returned too much data). Views whose
        definition came back empty are passed through the SHOW VIEWS fallback
        before being added to the result.
    """
    try:
        cur = self.connection.query(
            SnowflakeQuery.get_views_for_database(db_name),
        )
    except Exception as e:
        logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
        # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
        return None

    views: Dict[str, List[SnowflakeView]] = {}
    views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}

    for row in cur:
        schema_name, view = self._map_view(db_name, row)
        # Views without a definition are parked separately for the fallback lookup.
        if view.view_definition in (None, ""):
            target = views_with_empty_definition
        else:
            target = views
        target.setdefault(schema_name, []).append(view)

    for schema_name, empty_views in views_with_empty_definition.items():
        updated_views = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, empty_views
        )
        views.setdefault(schema_name, []).extend(updated_views)

    return views
|
|
628
|
+
|
|
629
|
+
def get_views_for_schema_using_information_schema(
    self, *, schema_name: str, db_name: str
) -> List[SnowflakeView]:
    """Fetch the views of one schema through the information-schema query.

    Views whose definition comes back empty are passed through the SHOW VIEWS
    fallback and appended after the views that already had a definition.

    Args:
        schema_name: Schema to list views for.
        db_name: Database containing the schema.

    Returns:
        Views with a definition first, followed by the views that went
        through the fallback lookup.
    """
    cur = self.connection.query(
        SnowflakeQuery.get_views_for_schema(
            db_name=db_name, schema_name=schema_name
        ),
    )

    views: List[SnowflakeView] = []
    views_with_empty_definition: List[SnowflakeView] = []

    for row in cur:
        # _map_view also returns the row's schema. It is discarded so the
        # caller-supplied schema_name is not overwritten and the fallback
        # lookup below targets the requested schema.
        _, view = self._map_view(db_name, row)
        if view.view_definition is None or view.view_definition == "":
            views_with_empty_definition.append(view)
        else:
            views.append(view)

    if views_with_empty_definition:
        updated_empty_views = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, views_with_empty_definition
        )
        views.extend(updated_empty_views)

    return views
|
|
453
655
|
|
|
454
656
|
@serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)
|
|
@@ -660,7 +862,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
660
862
|
def get_streams_for_database(
|
|
661
863
|
self, db_name: str
|
|
662
864
|
) -> Dict[str, List[SnowflakeStream]]:
|
|
663
|
-
page_limit =
|
|
865
|
+
page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
|
|
664
866
|
|
|
665
867
|
streams: Dict[str, List[SnowflakeStream]] = {}
|
|
666
868
|
|
|
@@ -743,3 +945,137 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
743
945
|
)
|
|
744
946
|
)
|
|
745
947
|
return procedures
|
|
948
|
+
|
|
949
|
+
@serialized_lru_cache(maxsize=1)
def get_dynamic_table_graph_info(self, db_name: str) -> Dict[str, Dict[str, Any]]:
    """Get dynamic table dependency information from information schema.

    Returns a mapping keyed by each row's NAME value. If the query or the row
    processing fails, a warning is recorded on the report and whatever was
    collected so far is returned.
    """
    # (result key, row column) pairs copied into each entry.
    graph_fields = (
        ("inputs", "INPUTS"),
        ("target_lag_type", "TARGET_LAG_TYPE"),
        ("target_lag_sec", "TARGET_LAG_SEC"),
        ("scheduling_state", "SCHEDULING_STATE"),
        ("alter_trigger", "ALTER_TRIGGER"),
    )
    dt_graph_info: Dict[str, Dict[str, Any]] = {}
    try:
        rows = self.connection.query(
            SnowflakeQuery.get_dynamic_table_graph_history(db_name)
        )
        for row in rows:
            dt_name = row["NAME"]
            dt_graph_info[dt_name] = {
                key: row.get(column) for key, column in graph_fields
            }
        logger.debug(
            f"Successfully retrieved graph info for {len(dt_graph_info)} dynamic tables in {db_name}"
        )
    except Exception as e:
        self.report.warning(
            "Failed to get dynamic table graph history",
            db_name,
            exc=e,
        )

    return dt_graph_info
|
|
977
|
+
|
|
978
|
+
@serialized_lru_cache(maxsize=1)
def get_dynamic_tables_with_definitions(
    self, db_name: str
) -> Dict[str, List[SnowflakeDynamicTable]]:
    """Get dynamic tables with their definitions using SHOW DYNAMIC TABLES.

    Results are fetched page by page and grouped by schema name. If a page
    fails, the error is logged at debug level and the tables gathered so far
    are returned.
    """
    dynamic_tables: Dict[str, List[SnowflakeDynamicTable]] = {}

    # Get graph/dependency information (pass db_name)
    dt_graph_info = self.get_dynamic_table_graph_info(db_name)

    page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
    dt_pagination_marker: Optional[str] = None

    while True:
        try:
            cur = self.connection.query(
                SnowflakeQuery.show_dynamic_tables_for_database(
                    db_name,
                    limit=page_limit,
                    dynamic_table_pagination_marker=dt_pagination_marker,
                )
            )

            # Reset the marker; it is set again only if this page is full.
            dt_pagination_marker = None
            result_set_size = 0

            for dt in cur:
                result_set_size += 1

                dt_name = dt["name"]
                schema_name = dt["schema_name"]
                schema_tables = dynamic_tables.setdefault(schema_name, [])

                # Get definition from SHOW result
                definition = dt.get("text")

                # Get target lag from SHOW result or graph info
                target_lag = dt.get("target_lag")
                if not target_lag and dt_graph_info:
                    graph_info = dt_graph_info.get(
                        f"{db_name}.{schema_name}.{dt_name}", {}
                    )
                    if graph_info.get("target_lag_type") and graph_info.get(
                        "target_lag_sec"
                    ):
                        target_lag = f"{graph_info['target_lag_sec']} {graph_info['target_lag_type']}"

                schema_tables.append(
                    SnowflakeDynamicTable(
                        name=dt_name,
                        created=dt["created_on"],
                        last_altered=dt.get("created_on"),
                        size_in_bytes=dt.get("bytes", 0),
                        rows_count=dt.get("rows", 0),
                        comment=dt.get("comment"),
                        definition=definition,
                        target_lag=target_lag,
                        is_dynamic=True,
                        type="DYNAMIC TABLE",
                    )
                )

            # A full page means there may be more rows after the last name seen.
            if result_set_size >= page_limit:
                logger.info(
                    f"Fetching next page of dynamic tables for {db_name} - after {dt_name}"
                )
                dt_pagination_marker = dt_name

        except Exception as e:
            logger.debug(
                f"Failed to get dynamic tables for database {db_name}: {e}"
            )
            break

        if dt_pagination_marker is None:
            break

    return dynamic_tables
|
|
1056
|
+
|
|
1057
|
+
def populate_dynamic_table_definitions(
    self, tables: Dict[str, List[SnowflakeTable]], db_name: str
) -> None:
    """Populate dynamic table definitions for tables that are marked as dynamic.

    For every SnowflakeDynamicTable in ``tables`` that has no definition yet,
    the definition and target lag are copied from the entry with the same name
    in the same schema of the SHOW DYNAMIC TABLES results. The tables are
    modified in place. Any failure is logged at debug level and otherwise
    ignored.

    Args:
        tables: Mapping of schema name to the tables found in that schema.
        db_name: Database the tables belong to.
    """
    try:
        # Get dynamic tables with definitions from SHOW command
        dt_with_definitions = self.get_dynamic_tables_with_definitions(db_name)

        for schema_name, table_list in tables.items():
            pending = [
                table
                for table in table_list
                if isinstance(table, SnowflakeDynamicTable)
                and table.definition is None
            ]
            if not pending:
                continue

            # Index the SHOW results by name once per schema instead of
            # rescanning the list for every table. setdefault keeps the first
            # entry for a name, matching a first-match linear search.
            show_dt_by_name: Dict[str, SnowflakeDynamicTable] = {}
            for show_dt in dt_with_definitions.get(schema_name, []):
                show_dt_by_name.setdefault(show_dt.name, show_dt)

            for table in pending:
                show_dt = show_dt_by_name.get(table.name)
                if show_dt is not None:
                    table.definition = show_dt.definition
                    table.target_lag = show_dt.target_lag
    except Exception as e:
        logger.debug(
            f"Failed to populate dynamic table definitions for {db_name}: {e}"
        )
|