acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,26 +1,28 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import os
|
|
3
2
|
from collections import defaultdict
|
|
4
3
|
from dataclasses import dataclass, field
|
|
5
4
|
from datetime import datetime
|
|
6
|
-
from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
|
|
5
|
+
from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
|
|
7
6
|
|
|
7
|
+
from datahub.configuration.env_vars import get_snowflake_schema_parallelism
|
|
8
8
|
from datahub.ingestion.api.report import SupportsAsObj
|
|
9
9
|
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
10
10
|
from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
|
|
11
11
|
from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
|
|
12
12
|
from datahub.ingestion.source.snowflake.snowflake_query import (
|
|
13
|
-
|
|
13
|
+
SHOW_COMMAND_MAX_PAGE_SIZE,
|
|
14
14
|
SnowflakeQuery,
|
|
15
15
|
)
|
|
16
|
+
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
16
17
|
from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
|
|
18
|
+
from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
|
|
17
19
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
18
20
|
from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
|
|
19
21
|
from datahub.utilities.serialized_lru_cache import serialized_lru_cache
|
|
20
22
|
|
|
21
23
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
22
24
|
|
|
23
|
-
SCHEMA_PARALLELISM =
|
|
25
|
+
SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
@dataclass
|
|
@@ -102,6 +104,17 @@ class SnowflakeTable(BaseTable):
|
|
|
102
104
|
return DatasetSubTypes.TABLE
|
|
103
105
|
|
|
104
106
|
|
|
107
|
+
@dataclass
|
|
108
|
+
class SnowflakeDynamicTable(SnowflakeTable):
|
|
109
|
+
definition: Optional[str] = (
|
|
110
|
+
None # SQL query that defines the dynamic table's content
|
|
111
|
+
)
|
|
112
|
+
target_lag: Optional[str] = None # Refresh frequency (e.g., "1 HOUR", "30 MINUTES")
|
|
113
|
+
|
|
114
|
+
def get_subtype(self) -> DatasetSubTypes:
|
|
115
|
+
return DatasetSubTypes.DYNAMIC_TABLE
|
|
116
|
+
|
|
117
|
+
|
|
105
118
|
@dataclass
|
|
106
119
|
class SnowflakeView(BaseView):
|
|
107
120
|
materialized: bool = False
|
|
@@ -225,10 +238,17 @@ class _SnowflakeTagCache:
|
|
|
225
238
|
|
|
226
239
|
|
|
227
240
|
class SnowflakeDataDictionary(SupportsAsObj):
|
|
228
|
-
def __init__(
|
|
241
|
+
def __init__(
|
|
242
|
+
self,
|
|
243
|
+
connection: SnowflakeConnection,
|
|
244
|
+
report: SnowflakeV2Report,
|
|
245
|
+
fetch_views_from_information_schema: bool = False,
|
|
246
|
+
) -> None:
|
|
229
247
|
self.connection = connection
|
|
248
|
+
self.report = report
|
|
249
|
+
self._fetch_views_from_information_schema = fetch_views_from_information_schema
|
|
230
250
|
|
|
231
|
-
def as_obj(self) -> Dict[str,
|
|
251
|
+
def as_obj(self) -> Dict[str, Any]:
|
|
232
252
|
# TODO: Move this into a proper report type that gets computed.
|
|
233
253
|
|
|
234
254
|
# Reports how many times we reset in-memory `functools.lru_cache` caches of data,
|
|
@@ -244,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
244
264
|
self.get_fk_constraints_for_schema,
|
|
245
265
|
]
|
|
246
266
|
|
|
247
|
-
report = {
|
|
267
|
+
report: Dict[str, Any] = {
|
|
268
|
+
"fetch_views_from_information_schema": self._fetch_views_from_information_schema,
|
|
269
|
+
}
|
|
248
270
|
for func in lru_cache_functions:
|
|
249
271
|
report[func.__name__] = func.cache_info()._asdict() # type: ignore
|
|
250
272
|
return report
|
|
@@ -354,8 +376,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
354
376
|
if table["TABLE_SCHEMA"] not in tables:
|
|
355
377
|
tables[table["TABLE_SCHEMA"]] = []
|
|
356
378
|
|
|
379
|
+
is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
|
|
380
|
+
table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
|
|
381
|
+
|
|
357
382
|
tables[table["TABLE_SCHEMA"]].append(
|
|
358
|
-
|
|
383
|
+
table_cls(
|
|
359
384
|
name=table["TABLE_NAME"],
|
|
360
385
|
type=table["TABLE_TYPE"],
|
|
361
386
|
created=table["CREATED"],
|
|
@@ -364,11 +389,15 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
364
389
|
rows_count=table["ROW_COUNT"],
|
|
365
390
|
comment=table["COMMENT"],
|
|
366
391
|
clustering_key=table["CLUSTERING_KEY"],
|
|
367
|
-
is_dynamic=
|
|
392
|
+
is_dynamic=is_dynamic,
|
|
368
393
|
is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
|
|
369
394
|
is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
|
|
370
395
|
)
|
|
371
396
|
)
|
|
397
|
+
|
|
398
|
+
# Populate dynamic table definitions
|
|
399
|
+
self.populate_dynamic_table_definitions(tables, db_name)
|
|
400
|
+
|
|
372
401
|
return tables
|
|
373
402
|
|
|
374
403
|
def get_tables_for_schema(
|
|
@@ -381,8 +410,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
381
410
|
)
|
|
382
411
|
|
|
383
412
|
for table in cur:
|
|
413
|
+
is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
|
|
414
|
+
table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
|
|
415
|
+
|
|
384
416
|
tables.append(
|
|
385
|
-
|
|
417
|
+
table_cls(
|
|
386
418
|
name=table["TABLE_NAME"],
|
|
387
419
|
type=table["TABLE_TYPE"],
|
|
388
420
|
created=table["CREATED"],
|
|
@@ -391,16 +423,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
391
423
|
rows_count=table["ROW_COUNT"],
|
|
392
424
|
comment=table["COMMENT"],
|
|
393
425
|
clustering_key=table["CLUSTERING_KEY"],
|
|
394
|
-
is_dynamic=
|
|
426
|
+
is_dynamic=is_dynamic,
|
|
395
427
|
is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
|
|
396
428
|
is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
|
|
397
429
|
)
|
|
398
430
|
)
|
|
431
|
+
|
|
432
|
+
# Populate dynamic table definitions for just this schema
|
|
433
|
+
schema_tables = {schema_name: tables}
|
|
434
|
+
self.populate_dynamic_table_definitions(schema_tables, db_name)
|
|
435
|
+
|
|
399
436
|
return tables
|
|
400
437
|
|
|
401
438
|
@serialized_lru_cache(maxsize=1)
def get_views_for_database(
    self, db_name: str
) -> Optional[Dict[str, List[SnowflakeView]]]:
    """Return the views of ``db_name`` grouped by schema name.

    The strategy is picked from the ``fetch_views_from_information_schema``
    setting given at construction time. The information-schema strategy may
    return ``None``; the SHOW-based strategy always returns a dict.
    """
    fetch_views = (
        self._get_views_for_database_using_information_schema
        if self._fetch_views_from_information_schema
        else self._get_views_for_database_using_show
    )
    return fetch_views(db_name)
|
|
446
|
+
|
|
447
|
+
def _get_views_for_database_using_show(
|
|
448
|
+
self, db_name: str
|
|
449
|
+
) -> Dict[str, List[SnowflakeView]]:
|
|
450
|
+
page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
|
|
404
451
|
|
|
405
452
|
views: Dict[str, List[SnowflakeView]] = {}
|
|
406
453
|
|
|
@@ -430,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
430
477
|
SnowflakeView(
|
|
431
478
|
name=view_name,
|
|
432
479
|
created=view["created_on"],
|
|
433
|
-
# last_altered=table["last_altered"],
|
|
434
480
|
comment=view["comment"],
|
|
435
481
|
view_definition=view["text"],
|
|
436
|
-
last_altered=view["created_on"],
|
|
482
|
+
last_altered=view["created_on"], # TODO: This is not correct.
|
|
437
483
|
materialized=(
|
|
438
484
|
view.get("is_materialized", "false").lower() == "true"
|
|
439
485
|
),
|
|
@@ -448,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
448
494
|
)
|
|
449
495
|
view_pagination_marker = view_name
|
|
450
496
|
|
|
497
|
+
# Because this is in a cached function, this will only log once per database.
|
|
498
|
+
view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
|
|
499
|
+
logger.info(
|
|
500
|
+
f"Finished fetching views in {db_name}; counts by schema {view_counts}"
|
|
501
|
+
)
|
|
502
|
+
return views
|
|
503
|
+
|
|
504
|
+
def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
    """Convert one view row into a ``(schema_name, SnowflakeView)`` pair.

    Args:
        db_name: Database the row belongs to; only used in the log message.
        row: Result row providing ``VIEW_SCHEMA``, ``VIEW_NAME``, ``CREATED``,
            ``COMMENT`` and ``LAST_ALTERED``, plus the optional
            ``VIEW_DEFINITION`` and ``IS_SECURE`` entries.

    Returns:
        The schema name taken from the row and the mapped view. The view is
        always marked as non-materialized (see the TODO below).
    """
    schema_name = row["VIEW_SCHEMA"]
    view_definition = row.get("VIEW_DEFINITION")
    # Only a short prefix of the definition is logged to keep the log readable.
    fragment_view_definition = (
        view_definition[:50].strip() if view_definition else None
    )
    logger.info(
        f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
    )

    # dict.get() only applies its default when the key is absent; a key that is
    # present with a None value would make a chained .lower() raise, so the
    # value is normalised explicitly.
    # NOTE(review): other information-schema flags handled in this file
    # (IS_DYNAMIC, IS_ICEBERG, IS_HYBRID) are compared against "YES", while
    # SHOW VIEWS style output uses "true"; both spellings are accepted here.
    # Confirm which one the underlying query actually returns.
    raw_is_secure = row.get("IS_SECURE")
    is_secure = raw_is_secure is not None and str(
        raw_is_secure
    ).strip().lower() in ("true", "yes")

    return schema_name, SnowflakeView(
        name=row["VIEW_NAME"],
        created=row["CREATED"],
        comment=row["COMMENT"],
        view_definition=view_definition,
        last_altered=row["LAST_ALTERED"],
        is_secure=is_secure,
        # TODO: This doesn't work for materialized views.
        materialized=False,
    )
|
|
524
|
+
|
|
525
|
+
def _maybe_populate_empty_view_definitions(
    self,
    db_name: str,
    schema_name: str,
    views_with_empty_definition: List[SnowflakeView],
) -> List[SnowflakeView]:
    """Best-effort backfill of missing view definitions via ``SHOW VIEWS``.

    The view names are handed to ``build_prefix_batches`` and one
    ``SHOW VIEWS LIKE '<prefix>%'`` query is issued per resulting prefix
    group. Any non-empty ``text`` found for a targeted view is written onto
    the corresponding object from the input list.

    Args:
        db_name: Database containing the views.
        schema_name: Schema containing the views.
        views_with_empty_definition: Views whose definition is missing.

    Returns:
        An empty list when nothing was passed in; otherwise the very list
        that was passed in, with its elements updated in place. When a query
        (or the processing of its rows) raises, the error is logged, the
        remaining batches are skipped and the list is returned as-is.
    """
    if not views_with_empty_definition:
        return []

    view_names = [view.name for view in views_with_empty_definition]
    # max_groups_in_batch=1 means a non-empty batch holds exactly one group,
    # so taking element 0 is safe; empty batches are dropped.
    batches = [
        groups[0]
        for groups in build_prefix_batches(
            view_names, max_batch_size=1000, max_groups_in_batch=1
        )
        if groups
    ]

    view_map: Dict[str, SnowflakeView] = {}
    for view in views_with_empty_definition:
        view_map[view.name] = view
    views_found_count = 0

    logger.info(
        f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
        f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
    )

    for batch_index, prefix_group in enumerate(batches):
        query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
        logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")

        try:
            # Every row must be inspected: the prefix pattern can match views
            # that were not targeted, and result order is not relied upon.
            for row in self.connection.query(query):
                view_name = row["name"]
                if view_name not in view_map:
                    continue

                view_definition = row.get("text")
                if not view_definition:
                    logger.warning(
                        f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
                    )
                    continue

                view_map[view_name].view_definition = view_definition
                views_found_count += 1
                logger.debug(
                    f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
                )
        except Exception as e:
            logger.error(
                f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
                exc_info=e,
            )
            # Give up on the remaining batches; some views may still lack a
            # definition.
            return views_with_empty_definition

    logger.info(
        f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
        f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
    )

    if views_found_count < len(view_map):
        missing_count = len(view_map) - views_found_count
        logger.warning(
            f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
        )

    # The objects in the input list were updated in place through view_map.
    return views_with_empty_definition
|
|
598
|
+
|
|
599
|
+
def _get_views_for_database_using_information_schema(
    self, db_name: str
) -> Optional[Dict[str, List[SnowflakeView]]]:
    """List all views of ``db_name`` with a single database-wide query.

    Views whose definition comes back empty are collected per schema and
    passed to ``_maybe_populate_empty_view_definitions`` before being added
    to the result, after the views that already had a definition.

    Returns:
        Views grouped by schema name, or ``None`` when the query raises.
    """
    try:
        cur = self.connection.query(SnowflakeQuery.get_views_for_database(db_name))
    except Exception as e:
        logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
        # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
        return None

    views: Dict[str, List[SnowflakeView]] = {}
    views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}

    for row in cur:
        schema_name, view = self._map_view(db_name, row)
        # Route each view to the bucket matching whether it has a definition.
        bucket = (
            views_with_empty_definition
            if view.view_definition in (None, "")
            else views
        )
        bucket.setdefault(schema_name, []).append(view)

    for schema_name, empty_views in views_with_empty_definition.items():
        backfilled = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, empty_views
        )
        views.setdefault(schema_name, []).extend(backfilled)

    return views
|
|
628
|
+
|
|
629
|
+
def get_views_for_schema_using_information_schema(
    self, *, schema_name: str, db_name: str
) -> List[SnowflakeView]:
    """List the views of a single schema.

    Args:
        schema_name: Schema to list views for (keyword-only).
        db_name: Database containing the schema (keyword-only).

    Returns:
        Views that came back with a definition, followed by the views whose
        definition was empty after a best-effort backfill through
        ``_maybe_populate_empty_view_definitions``.
    """
    cur = self.connection.query(
        SnowflakeQuery.get_views_for_schema(
            db_name=db_name, schema_name=schema_name
        ),
    )

    views: List[SnowflakeView] = []
    views_with_empty_definition: List[SnowflakeView] = []

    for row in cur:
        # The schema reported by the row is deliberately discarded: rebinding
        # `schema_name` here would overwrite the caller's argument, which is
        # still needed for the backfill call below.
        _, view = self._map_view(db_name, row)
        if view.view_definition is None or view.view_definition == "":
            views_with_empty_definition.append(view)
        else:
            views.append(view)

    if views_with_empty_definition:
        updated_empty_views = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, views_with_empty_definition
        )
        views.extend(updated_empty_views)

    return views
|
|
452
655
|
|
|
453
656
|
@serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)
|
|
@@ -659,7 +862,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
659
862
|
def get_streams_for_database(
|
|
660
863
|
self, db_name: str
|
|
661
864
|
) -> Dict[str, List[SnowflakeStream]]:
|
|
662
|
-
page_limit =
|
|
865
|
+
page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
|
|
663
866
|
|
|
664
867
|
streams: Dict[str, List[SnowflakeStream]] = {}
|
|
665
868
|
|
|
@@ -714,3 +917,165 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
714
917
|
stream_pagination_marker = stream_name
|
|
715
918
|
|
|
716
919
|
return streams
|
|
920
|
+
|
|
921
|
+
@serialized_lru_cache(maxsize=1)
def get_procedures_for_database(
    self, db_name: str
) -> Dict[str, List[BaseProcedure]]:
    """Return the procedures of ``db_name`` grouped by ``PROCEDURE_SCHEMA``.

    Each result row is mapped onto a ``BaseProcedure``; ``extra_properties``
    is always left as ``None``.
    """
    procedures: Dict[str, List[BaseProcedure]] = {}
    rows = self.connection.query(
        SnowflakeQuery.procedures_for_database(db_name),
    )

    for row in rows:
        # Create the per-schema list on first sight of the schema.
        schema_procedures = procedures.setdefault(row["PROCEDURE_SCHEMA"], [])
        schema_procedures.append(
            BaseProcedure(
                name=row["PROCEDURE_NAME"],
                language=row["PROCEDURE_LANGUAGE"],
                argument_signature=row["ARGUMENT_SIGNATURE"],
                return_type=row["PROCEDURE_RETURN_TYPE"],
                procedure_definition=row["PROCEDURE_DEFINITION"],
                created=row["CREATED"],
                last_altered=row["LAST_ALTERED"],
                comment=row["COMMENT"],
                extra_properties=None,
            )
        )

    return procedures
|
|
948
|
+
|
|
949
|
+
@serialized_lru_cache(maxsize=1)
def get_dynamic_table_graph_info(self, db_name: str) -> Dict[str, Dict[str, Any]]:
    """Get dynamic table dependency information from information schema.

    Returns:
        A mapping from each row's ``NAME`` value to a dict with the keys
        ``inputs``, ``target_lag_type``, ``target_lag_sec``,
        ``scheduling_state`` and ``alter_trigger``. If querying or row
        processing raises, a warning is added to the report and whatever was
        collected up to that point is returned.
    """
    # (result key, source column) pairs copied from every row.
    copied_columns = (
        ("inputs", "INPUTS"),
        ("target_lag_type", "TARGET_LAG_TYPE"),
        ("target_lag_sec", "TARGET_LAG_SEC"),
        ("scheduling_state", "SCHEDULING_STATE"),
        ("alter_trigger", "ALTER_TRIGGER"),
    )

    dt_graph_info: Dict[str, Dict[str, Any]] = {}
    try:
        rows = self.connection.query(
            SnowflakeQuery.get_dynamic_table_graph_history(db_name)
        )
        for row in rows:
            dt_name = row["NAME"]
            dt_graph_info[dt_name] = {
                key: row.get(column) for key, column in copied_columns
            }
        logger.debug(
            f"Successfully retrieved graph info for {len(dt_graph_info)} dynamic tables in {db_name}"
        )
    except Exception as e:
        self.report.warning(
            "Failed to get dynamic table graph history",
            db_name,
            exc=e,
        )

    return dt_graph_info
|
|
977
|
+
|
|
978
|
+
@serialized_lru_cache(maxsize=1)
def get_dynamic_tables_with_definitions(
    self, db_name: str
) -> Dict[str, List[SnowflakeDynamicTable]]:
    """Get dynamic tables with their definitions using SHOW DYNAMIC TABLES.

    Results are fetched page by page: whenever a page contains at least
    ``SHOW_COMMAND_MAX_PAGE_SIZE`` rows, another query is issued using the
    last seen table name as pagination marker.

    Returns:
        Dynamic tables grouped by schema name. If a query or the processing
        of its rows raises, the error is logged at debug level, paging stops
        and the tables collected so far are returned.
    """
    page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
    dynamic_tables: Dict[str, List[SnowflakeDynamicTable]] = {}

    # Graph/dependency information, used as a fallback source for target lag.
    dt_graph_info = self.get_dynamic_table_graph_info(db_name)

    dt_pagination_marker: Optional[str] = None
    while True:
        try:
            cur = self.connection.query(
                SnowflakeQuery.show_dynamic_tables_for_database(
                    db_name,
                    limit=page_limit,
                    dynamic_table_pagination_marker=dt_pagination_marker,
                )
            )

            # Reset the marker; it is only set again if this page was full.
            dt_pagination_marker = None
            result_set_size = 0

            for dt in cur:
                result_set_size += 1

                dt_name = dt["name"]
                schema_name = dt["schema_name"]
                schema_tables = dynamic_tables.setdefault(schema_name, [])

                # Prefer the target lag from the SHOW result; otherwise try to
                # rebuild it from the graph info.
                # NOTE(review): get_dynamic_table_graph_info keys its result
                # by the row's NAME column, while this lookup uses
                # db.schema.name - confirm that NAME is fully qualified.
                target_lag = dt.get("target_lag")
                if not target_lag and dt_graph_info:
                    qualified_name = f"{db_name}.{schema_name}.{dt_name}"
                    graph_info = dt_graph_info.get(qualified_name, {})
                    has_lag_type = graph_info.get("target_lag_type")
                    if has_lag_type and graph_info.get("target_lag_sec"):
                        target_lag = f"{graph_info['target_lag_sec']} {graph_info['target_lag_type']}"

                schema_tables.append(
                    SnowflakeDynamicTable(
                        name=dt_name,
                        created=dt["created_on"],
                        last_altered=dt.get("created_on"),
                        size_in_bytes=dt.get("bytes", 0),
                        rows_count=dt.get("rows", 0),
                        comment=dt.get("comment"),
                        definition=dt.get("text"),
                        target_lag=target_lag,
                        is_dynamic=True,
                        type="DYNAMIC TABLE",
                    )
                )

            if result_set_size >= page_limit:
                logger.info(
                    f"Fetching next page of dynamic tables for {db_name} - after {dt_name}"
                )
                dt_pagination_marker = dt_name

        except Exception as e:
            logger.debug(
                f"Failed to get dynamic tables for database {db_name}: {e}"
            )
            break

        if dt_pagination_marker is None:
            # The last page was not full, so there is nothing more to fetch.
            break

    return dynamic_tables
|
|
1056
|
+
|
|
1057
|
+
def populate_dynamic_table_definitions(
    self, tables: Dict[str, List[SnowflakeTable]], db_name: str
) -> None:
    """Populate dynamic table definitions for tables that are marked as dynamic.

    For every ``SnowflakeDynamicTable`` in ``tables`` whose ``definition`` is
    still ``None``, the entry with the same schema and name from
    ``get_dynamic_tables_with_definitions`` supplies ``definition`` and
    ``target_lag``. Tables are updated in place.

    Args:
        tables: Tables grouped by schema name.
        db_name: Database the tables belong to.

    Any exception is logged at debug level and otherwise ignored, so callers
    never fail because of this enrichment step.
    """
    try:
        # Dynamic tables with definitions, as returned by the SHOW command.
        dt_with_definitions = self.get_dynamic_tables_with_definitions(db_name)

        for schema_name, table_list in tables.items():
            pending = [
                table
                for table in table_list
                if isinstance(table, SnowflakeDynamicTable)
                and table.definition is None
            ]
            if not pending:
                continue

            # Index the SHOW results by name once per schema instead of
            # scanning the list for every table. setdefault keeps the first
            # entry for a name, matching a first-match linear search.
            show_dt_by_name: Dict[str, SnowflakeDynamicTable] = {}
            for show_dt in dt_with_definitions.get(schema_name, []):
                show_dt_by_name.setdefault(show_dt.name, show_dt)

            for table in pending:
                match = show_dt_by_name.get(table.name)
                if match is not None:
                    table.definition = match.definition
                    table.target_lag = match.target_lag
    except Exception as e:
        logger.debug(
            f"Failed to populate dynamic table definitions for {db_name}: {e}"
        )
|