acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -41,9 +41,11 @@ from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
|
|
|
41
41
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
42
42
|
from datahub.ingestion.source.snowflake.snowflake_schema import (
|
|
43
43
|
SCHEMA_PARALLELISM,
|
|
44
|
+
BaseProcedure,
|
|
44
45
|
SnowflakeColumn,
|
|
45
46
|
SnowflakeDatabase,
|
|
46
47
|
SnowflakeDataDictionary,
|
|
48
|
+
SnowflakeDynamicTable,
|
|
47
49
|
SnowflakeFK,
|
|
48
50
|
SnowflakePK,
|
|
49
51
|
SnowflakeSchema,
|
|
@@ -63,17 +65,19 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
63
65
|
from datahub.ingestion.source.sql.sql_utils import (
|
|
64
66
|
add_table_to_schema_container,
|
|
65
67
|
gen_database_container,
|
|
66
|
-
gen_database_key,
|
|
67
68
|
gen_schema_container,
|
|
68
|
-
gen_schema_key,
|
|
69
69
|
get_dataplatform_instance_aspect,
|
|
70
70
|
get_domain_wu,
|
|
71
71
|
)
|
|
72
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
73
|
+
generate_procedure_container_workunits,
|
|
74
|
+
generate_procedure_workunits,
|
|
75
|
+
)
|
|
72
76
|
from datahub.ingestion.source_report.ingestion_stage import (
|
|
73
77
|
EXTERNAL_TABLE_DDL_LINEAGE,
|
|
74
78
|
LINEAGE_EXTRACTION,
|
|
75
79
|
METADATA_EXTRACTION,
|
|
76
|
-
|
|
80
|
+
IngestionHighStage,
|
|
77
81
|
)
|
|
78
82
|
from datahub.metadata.com.linkedin.pegasus2avro.common import (
|
|
79
83
|
GlobalTags,
|
|
@@ -162,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
162
166
|
|
|
163
167
|
def __init__(
|
|
164
168
|
self,
|
|
165
|
-
config: SnowflakeV2Config,
|
|
166
|
-
report: SnowflakeV2Report,
|
|
169
|
+
config: SnowflakeV2Config, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
|
|
170
|
+
report: SnowflakeV2Report, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
|
|
167
171
|
connection: SnowflakeConnection,
|
|
168
172
|
filters: SnowflakeFilter,
|
|
169
173
|
identifiers: SnowflakeIdentifierBuilder,
|
|
@@ -171,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
171
175
|
profiler: Optional[SnowflakeProfiler],
|
|
172
176
|
aggregator: Optional[SqlParsingAggregator],
|
|
173
177
|
snowsight_url_builder: Optional[SnowsightUrlBuilder],
|
|
178
|
+
fetch_views_from_information_schema: bool = False,
|
|
174
179
|
) -> None:
|
|
175
180
|
self.config: SnowflakeV2Config = config
|
|
176
181
|
self.report: SnowflakeV2Report = report
|
|
@@ -179,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
179
184
|
self.identifiers: SnowflakeIdentifierBuilder = identifiers
|
|
180
185
|
|
|
181
186
|
self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
|
|
182
|
-
connection=self.connection
|
|
187
|
+
connection=self.connection,
|
|
188
|
+
report=self.report,
|
|
189
|
+
fetch_views_from_information_schema=fetch_views_from_information_schema,
|
|
183
190
|
)
|
|
184
191
|
self.report.data_dictionary_cache = self.data_dictionary
|
|
185
192
|
|
|
@@ -353,7 +360,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
353
360
|
yield from self._process_db_schemas(snowflake_db, db_tables)
|
|
354
361
|
|
|
355
362
|
if self.profiler and db_tables:
|
|
356
|
-
with self.report.
|
|
363
|
+
with self.report.new_high_stage(IngestionHighStage.PROFILING):
|
|
357
364
|
yield from self.profiler.get_workunits(snowflake_db, db_tables)
|
|
358
365
|
|
|
359
366
|
def _process_db_schemas(
|
|
@@ -434,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
434
441
|
tables = self.fetch_tables_for_schema(
|
|
435
442
|
snowflake_schema, db_name, schema_name
|
|
436
443
|
)
|
|
444
|
+
if self.config.include_views:
|
|
445
|
+
views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
|
|
446
|
+
|
|
447
|
+
if self.config.include_tables:
|
|
437
448
|
db_tables[schema_name] = tables
|
|
438
449
|
yield from self._process_tables(
|
|
439
450
|
tables, snowflake_schema, db_name, schema_name
|
|
440
451
|
)
|
|
441
452
|
|
|
442
453
|
if self.config.include_views:
|
|
443
|
-
views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
|
|
444
454
|
yield from self._process_views(
|
|
445
455
|
views, snowflake_schema, db_name, schema_name
|
|
446
456
|
)
|
|
@@ -448,10 +458,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
448
458
|
if self.config.include_streams:
|
|
449
459
|
self.report.num_get_streams_for_schema_queries += 1
|
|
450
460
|
streams = self.fetch_streams_for_schema(
|
|
451
|
-
snowflake_schema,
|
|
461
|
+
snowflake_schema,
|
|
462
|
+
db_name,
|
|
452
463
|
)
|
|
453
464
|
yield from self._process_streams(streams, snowflake_schema, db_name)
|
|
454
465
|
|
|
466
|
+
if self.config.include_procedures:
|
|
467
|
+
procedures = self.fetch_procedures_for_schema(snowflake_schema, db_name)
|
|
468
|
+
yield from self._process_procedures(procedures, snowflake_schema, db_name)
|
|
469
|
+
|
|
455
470
|
if self.config.include_technical_schema and snowflake_schema.tags:
|
|
456
471
|
yield from self._process_tags_in_schema(snowflake_schema)
|
|
457
472
|
|
|
@@ -487,6 +502,22 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
487
502
|
if self.config.include_technical_schema:
|
|
488
503
|
data_reader = self.make_data_reader()
|
|
489
504
|
for table in tables:
|
|
505
|
+
# Handle dynamic table definitions for lineage
|
|
506
|
+
if (
|
|
507
|
+
isinstance(table, SnowflakeDynamicTable)
|
|
508
|
+
and table.definition
|
|
509
|
+
and self.aggregator
|
|
510
|
+
):
|
|
511
|
+
table_identifier = self.identifiers.get_dataset_identifier(
|
|
512
|
+
table.name, schema_name, db_name
|
|
513
|
+
)
|
|
514
|
+
self.aggregator.add_view_definition(
|
|
515
|
+
view_urn=self.identifiers.gen_dataset_urn(table_identifier),
|
|
516
|
+
view_definition=table.definition,
|
|
517
|
+
default_db=db_name,
|
|
518
|
+
default_schema=schema_name,
|
|
519
|
+
)
|
|
520
|
+
|
|
490
521
|
table_wu_generator = self._process_table(
|
|
491
522
|
table, snowflake_schema, db_name
|
|
492
523
|
)
|
|
@@ -536,6 +567,26 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
536
567
|
for stream in streams:
|
|
537
568
|
yield from self._process_stream(stream, snowflake_schema, db_name)
|
|
538
569
|
|
|
570
|
+
def _process_procedures(
|
|
571
|
+
self,
|
|
572
|
+
procedures: List[BaseProcedure],
|
|
573
|
+
snowflake_schema: SnowflakeSchema,
|
|
574
|
+
db_name: str,
|
|
575
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
576
|
+
if self.config.include_technical_schema:
|
|
577
|
+
if procedures:
|
|
578
|
+
yield from generate_procedure_container_workunits(
|
|
579
|
+
self.identifiers.gen_database_key(
|
|
580
|
+
db_name,
|
|
581
|
+
),
|
|
582
|
+
self.identifiers.gen_schema_key(
|
|
583
|
+
db_name=db_name,
|
|
584
|
+
schema_name=snowflake_schema.name,
|
|
585
|
+
),
|
|
586
|
+
)
|
|
587
|
+
for procedure in procedures:
|
|
588
|
+
yield from self._process_procedure(procedure, snowflake_schema, db_name)
|
|
589
|
+
|
|
539
590
|
def _process_tags_in_schema(
|
|
540
591
|
self, snowflake_schema: SnowflakeSchema
|
|
541
592
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -819,13 +870,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
819
870
|
entityUrn=dataset_urn, aspect=dataset_properties
|
|
820
871
|
).as_workunit()
|
|
821
872
|
|
|
822
|
-
schema_container_key = gen_schema_key(
|
|
823
|
-
db_name=self.snowflake_identifier(db_name),
|
|
824
|
-
schema=self.snowflake_identifier(schema_name),
|
|
825
|
-
platform=self.platform,
|
|
826
|
-
platform_instance=self.config.platform_instance,
|
|
827
|
-
env=self.config.env,
|
|
828
|
-
)
|
|
873
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema_name)
|
|
829
874
|
|
|
830
875
|
if self.config.extract_tags_as_structured_properties:
|
|
831
876
|
yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
|
|
@@ -913,6 +958,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
913
958
|
}
|
|
914
959
|
)
|
|
915
960
|
|
|
961
|
+
if isinstance(table, SnowflakeDynamicTable):
|
|
962
|
+
if table.target_lag:
|
|
963
|
+
custom_properties["TARGET_LAG"] = table.target_lag
|
|
964
|
+
|
|
916
965
|
if isinstance(table, SnowflakeView) and table.is_secure:
|
|
917
966
|
custom_properties["IS_SECURE"] = "true"
|
|
918
967
|
|
|
@@ -958,7 +1007,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
958
1007
|
schema_name,
|
|
959
1008
|
db_name,
|
|
960
1009
|
(
|
|
961
|
-
SnowflakeObjectDomain.
|
|
1010
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE
|
|
1011
|
+
if isinstance(table, SnowflakeTable) and table.is_dynamic
|
|
1012
|
+
else SnowflakeObjectDomain.TABLE
|
|
962
1013
|
if isinstance(table, SnowflakeTable)
|
|
963
1014
|
else SnowflakeObjectDomain.VIEW
|
|
964
1015
|
),
|
|
@@ -1094,11 +1145,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1094
1145
|
def gen_database_containers(
|
|
1095
1146
|
self, database: SnowflakeDatabase
|
|
1096
1147
|
) -> Iterable[MetadataWorkUnit]:
|
|
1097
|
-
database_container_key = gen_database_key(
|
|
1098
|
-
|
|
1099
|
-
platform=self.platform,
|
|
1100
|
-
platform_instance=self.config.platform_instance,
|
|
1101
|
-
env=self.config.env,
|
|
1148
|
+
database_container_key = self.identifiers.gen_database_key(
|
|
1149
|
+
database.name,
|
|
1102
1150
|
)
|
|
1103
1151
|
|
|
1104
1152
|
yield from gen_database_container(
|
|
@@ -1147,21 +1195,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1147
1195
|
def gen_schema_containers(
|
|
1148
1196
|
self, schema: SnowflakeSchema, db_name: str
|
|
1149
1197
|
) -> Iterable[MetadataWorkUnit]:
|
|
1150
|
-
|
|
1151
|
-
database_container_key = gen_database_key(
|
|
1152
|
-
database=self.snowflake_identifier(db_name),
|
|
1153
|
-
platform=self.platform,
|
|
1154
|
-
platform_instance=self.config.platform_instance,
|
|
1155
|
-
env=self.config.env,
|
|
1156
|
-
)
|
|
1198
|
+
database_container_key = self.identifiers.gen_database_key(db_name)
|
|
1157
1199
|
|
|
1158
|
-
schema_container_key = gen_schema_key(
|
|
1159
|
-
db_name=self.snowflake_identifier(db_name),
|
|
1160
|
-
schema=schema_name,
|
|
1161
|
-
platform=self.platform,
|
|
1162
|
-
platform_instance=self.config.platform_instance,
|
|
1163
|
-
env=self.config.env,
|
|
1164
|
-
)
|
|
1200
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema.name)
|
|
1165
1201
|
|
|
1166
1202
|
yield from gen_schema_container(
|
|
1167
1203
|
name=schema.name,
|
|
@@ -1211,7 +1247,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1211
1247
|
# falling back to get tables for schema
|
|
1212
1248
|
if tables is None:
|
|
1213
1249
|
self.report.num_get_tables_for_schema_queries += 1
|
|
1214
|
-
return self.data_dictionary.get_tables_for_schema(
|
|
1250
|
+
return self.data_dictionary.get_tables_for_schema(
|
|
1251
|
+
db_name=db_name,
|
|
1252
|
+
schema_name=schema_name,
|
|
1253
|
+
)
|
|
1215
1254
|
|
|
1216
1255
|
# Some schema may not have any table
|
|
1217
1256
|
return tables.get(schema_name, [])
|
|
@@ -1221,8 +1260,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1221
1260
|
) -> List[SnowflakeView]:
|
|
1222
1261
|
views = self.data_dictionary.get_views_for_database(db_name)
|
|
1223
1262
|
|
|
1224
|
-
|
|
1225
|
-
|
|
1263
|
+
if views is not None:
|
|
1264
|
+
# Some schemas may not have any views
|
|
1265
|
+
return views.get(schema_name, [])
|
|
1266
|
+
|
|
1267
|
+
# Usually this fails when there are too many views in the schema.
|
|
1268
|
+
# Fall back to per-schema queries.
|
|
1269
|
+
self.report.num_get_views_for_schema_queries += 1
|
|
1270
|
+
return self.data_dictionary.get_views_for_schema_using_information_schema(
|
|
1271
|
+
db_name=db_name,
|
|
1272
|
+
schema_name=schema_name,
|
|
1273
|
+
)
|
|
1226
1274
|
|
|
1227
1275
|
def get_columns_for_table(
|
|
1228
1276
|
self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str
|
|
@@ -1290,13 +1338,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1290
1338
|
)
|
|
1291
1339
|
|
|
1292
1340
|
def fetch_streams_for_schema(
|
|
1293
|
-
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1341
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1294
1342
|
) -> List[SnowflakeStream]:
|
|
1295
1343
|
try:
|
|
1296
1344
|
streams: List[SnowflakeStream] = []
|
|
1297
|
-
for stream in self.get_streams_for_schema(
|
|
1345
|
+
for stream in self.get_streams_for_schema(snowflake_schema.name, db_name):
|
|
1298
1346
|
stream_identifier = self.identifiers.get_dataset_identifier(
|
|
1299
|
-
stream.name,
|
|
1347
|
+
stream.name, snowflake_schema.name, db_name
|
|
1300
1348
|
)
|
|
1301
1349
|
|
|
1302
1350
|
self.report.report_entity_scanned(stream_identifier, "stream")
|
|
@@ -1310,16 +1358,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1310
1358
|
snowflake_schema.streams = [stream.name for stream in streams]
|
|
1311
1359
|
return streams
|
|
1312
1360
|
except Exception as e:
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
return []
|
|
1361
|
+
self.structured_reporter.warning(
|
|
1362
|
+
title="Failed to get streams for schema",
|
|
1363
|
+
message="Please check permissions"
|
|
1364
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1365
|
+
else "",
|
|
1366
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1367
|
+
exc=e,
|
|
1368
|
+
)
|
|
1369
|
+
return []
|
|
1323
1370
|
|
|
1324
1371
|
def get_streams_for_schema(
|
|
1325
1372
|
self, schema_name: str, db_name: str
|
|
@@ -1328,6 +1375,42 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1328
1375
|
|
|
1329
1376
|
return streams.get(schema_name, [])
|
|
1330
1377
|
|
|
1378
|
+
def fetch_procedures_for_schema(
|
|
1379
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1380
|
+
) -> List[BaseProcedure]:
|
|
1381
|
+
try:
|
|
1382
|
+
procedures: List[BaseProcedure] = []
|
|
1383
|
+
for procedure in self.get_procedures_for_schema(snowflake_schema, db_name):
|
|
1384
|
+
procedure_qualified_name = self.identifiers.get_dataset_identifier(
|
|
1385
|
+
procedure.name, snowflake_schema.name, db_name
|
|
1386
|
+
)
|
|
1387
|
+
self.report.report_entity_scanned(procedure_qualified_name, "procedure")
|
|
1388
|
+
|
|
1389
|
+
if self.filters.is_procedure_allowed(procedure_qualified_name):
|
|
1390
|
+
procedures.append(procedure)
|
|
1391
|
+
else:
|
|
1392
|
+
self.report.report_dropped(procedure_qualified_name)
|
|
1393
|
+
return procedures
|
|
1394
|
+
except Exception as e:
|
|
1395
|
+
self.structured_reporter.warning(
|
|
1396
|
+
title="Failed to get procedures for schema",
|
|
1397
|
+
message="Please check permissions"
|
|
1398
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1399
|
+
else "",
|
|
1400
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1401
|
+
exc=e,
|
|
1402
|
+
)
|
|
1403
|
+
return []
|
|
1404
|
+
|
|
1405
|
+
def get_procedures_for_schema(
|
|
1406
|
+
self,
|
|
1407
|
+
snowflake_schema: SnowflakeSchema,
|
|
1408
|
+
db_name: str,
|
|
1409
|
+
) -> List[BaseProcedure]:
|
|
1410
|
+
procedures = self.data_dictionary.get_procedures_for_database(db_name)
|
|
1411
|
+
|
|
1412
|
+
return procedures.get(snowflake_schema.name, [])
|
|
1413
|
+
|
|
1331
1414
|
def _process_stream(
|
|
1332
1415
|
self,
|
|
1333
1416
|
stream: SnowflakeStream,
|
|
@@ -1350,6 +1433,34 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1350
1433
|
"Failed to get columns for stream:", stream.name, exc=e
|
|
1351
1434
|
)
|
|
1352
1435
|
|
|
1436
|
+
def _process_procedure(
|
|
1437
|
+
self,
|
|
1438
|
+
procedure: BaseProcedure,
|
|
1439
|
+
snowflake_schema: SnowflakeSchema,
|
|
1440
|
+
db_name: str,
|
|
1441
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1442
|
+
try:
|
|
1443
|
+
# TODO: For CLL, we should process procedures after all tables are processed
|
|
1444
|
+
yield from generate_procedure_workunits(
|
|
1445
|
+
procedure,
|
|
1446
|
+
database_key=self.identifiers.gen_database_key(
|
|
1447
|
+
db_name,
|
|
1448
|
+
),
|
|
1449
|
+
schema_key=self.identifiers.gen_schema_key(
|
|
1450
|
+
db_name, snowflake_schema.name
|
|
1451
|
+
),
|
|
1452
|
+
schema_resolver=(
|
|
1453
|
+
self.aggregator._schema_resolver if self.aggregator else None
|
|
1454
|
+
),
|
|
1455
|
+
)
|
|
1456
|
+
except Exception as e:
|
|
1457
|
+
self.structured_reporter.warning(
|
|
1458
|
+
title="Failed to ingest stored procedure",
|
|
1459
|
+
message="",
|
|
1460
|
+
context=procedure.name,
|
|
1461
|
+
exc=e,
|
|
1462
|
+
)
|
|
1463
|
+
|
|
1353
1464
|
def get_columns_for_stream(
|
|
1354
1465
|
self,
|
|
1355
1466
|
source_object: str, # Qualified name of source table/view
|
|
@@ -20,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import (
|
|
|
20
20
|
SnowflakeSchemaGenerator,
|
|
21
21
|
)
|
|
22
22
|
from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
23
|
+
SnowflakeFilter,
|
|
23
24
|
SnowflakeIdentifierBuilder,
|
|
24
25
|
)
|
|
25
26
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
@@ -58,7 +59,7 @@ class SnowflakeSummaryReport(SourceReport, BaseTimeWindowReport):
|
|
|
58
59
|
|
|
59
60
|
|
|
60
61
|
@config_class(SnowflakeSummaryConfig)
|
|
61
|
-
@support_status(SupportStatus.
|
|
62
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
62
63
|
class SnowflakeSummarySource(Source):
|
|
63
64
|
def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig):
|
|
64
65
|
super().__init__(ctx)
|
|
@@ -81,6 +82,11 @@ class SnowflakeSummarySource(Source):
|
|
|
81
82
|
profiler=None,
|
|
82
83
|
aggregator=None,
|
|
83
84
|
snowsight_url_builder=None,
|
|
85
|
+
filters=SnowflakeFilter(
|
|
86
|
+
filter_config=self.config,
|
|
87
|
+
structured_reporter=self.report,
|
|
88
|
+
),
|
|
89
|
+
fetch_views_from_information_schema=False, # we haven't enabled this config for SnowflakeSummarySource
|
|
84
90
|
)
|
|
85
91
|
|
|
86
92
|
# Databases.
|
|
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
|
|
|
23
23
|
from datahub.metadata.com.linkedin.pegasus2avro.structured import (
|
|
24
24
|
StructuredPropertyDefinition,
|
|
25
25
|
)
|
|
26
|
+
from datahub.metadata.schema_classes import ChangeTypeClass
|
|
26
27
|
from datahub.metadata.urns import (
|
|
27
28
|
ContainerUrn,
|
|
28
29
|
DatasetUrn,
|
|
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
|
|
|
81
82
|
def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
|
|
82
83
|
for tag in self.data_dictionary.get_all_tags():
|
|
83
84
|
if not self.config.structured_property_pattern.allowed(
|
|
84
|
-
tag.
|
|
85
|
+
tag._id_prefix_as_str()
|
|
85
86
|
):
|
|
86
87
|
continue
|
|
87
88
|
if self.config.extract_tags_as_structured_properties:
|
|
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
|
|
|
111
112
|
yield MetadataChangeProposalWrapper(
|
|
112
113
|
entityUrn=urn,
|
|
113
114
|
aspect=aspect,
|
|
115
|
+
changeType=ChangeTypeClass.CREATE,
|
|
116
|
+
headers={"If-None-Match": "*"},
|
|
114
117
|
).as_workunit()
|
|
115
118
|
|
|
116
119
|
def _get_tags_on_object_with_propagation(
|
|
@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
231
231
|
|
|
232
232
|
with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
|
|
233
233
|
for row in results:
|
|
234
|
-
with
|
|
234
|
+
with (
|
|
235
|
+
fetch_timer.pause(),
|
|
236
|
+
self.report.usage_aggregation.result_skip_timer as skip_timer,
|
|
237
|
+
):
|
|
235
238
|
if results.rownumber is not None and results.rownumber % 1000 == 0:
|
|
236
239
|
logger.debug(f"Processing usage row number {results.rownumber}")
|
|
237
240
|
logger.debug(self.report.usage_aggregation.as_string())
|
|
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
255
258
|
f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
|
|
256
259
|
)
|
|
257
260
|
continue
|
|
258
|
-
with
|
|
261
|
+
with (
|
|
262
|
+
skip_timer.pause(),
|
|
263
|
+
self.report.usage_aggregation.result_map_timer as map_timer,
|
|
264
|
+
):
|
|
259
265
|
wu = self.build_usage_statistics_for_dataset(
|
|
260
266
|
dataset_identifier, row
|
|
261
267
|
)
|
|
@@ -3,9 +3,13 @@ from functools import cached_property
|
|
|
3
3
|
from typing import ClassVar, List, Literal, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
6
|
-
from datahub.emitter.mce_builder import
|
|
6
|
+
from datahub.emitter.mce_builder import (
|
|
7
|
+
make_dataset_urn_with_platform_instance,
|
|
8
|
+
)
|
|
9
|
+
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
7
10
|
from datahub.ingestion.api.source import SourceReport
|
|
8
11
|
from datahub.ingestion.source.snowflake.constants import (
|
|
12
|
+
DEFAULT_SNOWFLAKE_DOMAIN,
|
|
9
13
|
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
|
|
10
14
|
SnowflakeCloudProvider,
|
|
11
15
|
SnowflakeObjectDomain,
|
|
@@ -16,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
|
16
20
|
SnowflakeV2Config,
|
|
17
21
|
)
|
|
18
22
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
23
|
+
from datahub.ingestion.source.sql.sql_utils import gen_database_key, gen_schema_key
|
|
19
24
|
|
|
20
25
|
|
|
21
26
|
class SnowflakeStructuredReportMixin(abc.ABC):
|
|
@@ -30,16 +35,21 @@ class SnowsightUrlBuilder:
|
|
|
30
35
|
"us-east-1",
|
|
31
36
|
"eu-west-1",
|
|
32
37
|
"eu-central-1",
|
|
33
|
-
"ap-southeast-1",
|
|
34
38
|
"ap-southeast-2",
|
|
35
39
|
]
|
|
36
40
|
|
|
37
41
|
snowsight_base_url: str
|
|
38
42
|
|
|
39
|
-
def __init__(
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
account_locator: str,
|
|
46
|
+
region: str,
|
|
47
|
+
privatelink: bool = False,
|
|
48
|
+
snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
|
|
49
|
+
):
|
|
40
50
|
cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region)
|
|
41
51
|
self.snowsight_base_url = self.create_snowsight_base_url(
|
|
42
|
-
account_locator, cloud_region_id, cloud, privatelink
|
|
52
|
+
account_locator, cloud_region_id, cloud, privatelink, snowflake_domain
|
|
43
53
|
)
|
|
44
54
|
|
|
45
55
|
@staticmethod
|
|
@@ -48,6 +58,7 @@ class SnowsightUrlBuilder:
|
|
|
48
58
|
cloud_region_id: str,
|
|
49
59
|
cloud: str,
|
|
50
60
|
privatelink: bool = False,
|
|
61
|
+
snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
|
|
51
62
|
) -> str:
|
|
52
63
|
if cloud:
|
|
53
64
|
url_cloud_provider_suffix = f".{cloud}"
|
|
@@ -62,8 +73,14 @@ class SnowsightUrlBuilder:
|
|
|
62
73
|
url_cloud_provider_suffix = ""
|
|
63
74
|
else:
|
|
64
75
|
url_cloud_provider_suffix = f".{cloud}"
|
|
65
|
-
|
|
66
|
-
|
|
76
|
+
# Note: Snowsight is always accessed via the public internet (app.snowflake.com)
|
|
77
|
+
# even for accounts using privatelink. Privatelink only applies to database connections,
|
|
78
|
+
# not the Snowsight web UI.
|
|
79
|
+
# Standard Snowsight URL format - works for most regions
|
|
80
|
+
# China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
|
|
81
|
+
# guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
|
|
82
|
+
if snowflake_domain == "snowflakecomputing.cn":
|
|
83
|
+
url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
|
|
67
84
|
else:
|
|
68
85
|
url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
|
|
69
86
|
return url
|
|
@@ -73,7 +90,7 @@ class SnowsightUrlBuilder:
|
|
|
73
90
|
region: str,
|
|
74
91
|
) -> Tuple[str, str]:
|
|
75
92
|
cloud: str
|
|
76
|
-
if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING
|
|
93
|
+
if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
|
|
77
94
|
cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
|
|
78
95
|
elif region.startswith(("aws_", "gcp_", "azure_")):
|
|
79
96
|
# e.g. aws_us_west_2, gcp_us_central1, azure_northeurope
|
|
@@ -89,9 +106,20 @@ class SnowsightUrlBuilder:
|
|
|
89
106
|
table_name: str,
|
|
90
107
|
schema_name: str,
|
|
91
108
|
db_name: str,
|
|
92
|
-
domain: Literal[
|
|
109
|
+
domain: Literal[
|
|
110
|
+
SnowflakeObjectDomain.TABLE,
|
|
111
|
+
SnowflakeObjectDomain.VIEW,
|
|
112
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
113
|
+
],
|
|
93
114
|
) -> Optional[str]:
|
|
94
|
-
|
|
115
|
+
# For dynamic tables, use the dynamic-table domain in the URL path
|
|
116
|
+
# Ensure only explicitly dynamic tables use dynamic-table URL path
|
|
117
|
+
url_domain = (
|
|
118
|
+
"dynamic-table"
|
|
119
|
+
if domain == SnowflakeObjectDomain.DYNAMIC_TABLE
|
|
120
|
+
else str(domain)
|
|
121
|
+
)
|
|
122
|
+
return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{url_domain}/{table_name}/"
|
|
95
123
|
|
|
96
124
|
def get_external_url_for_schema(
|
|
97
125
|
self, schema_name: str, db_name: str
|
|
@@ -125,6 +153,7 @@ class SnowflakeFilter:
|
|
|
125
153
|
SnowflakeObjectDomain.MATERIALIZED_VIEW,
|
|
126
154
|
SnowflakeObjectDomain.ICEBERG_TABLE,
|
|
127
155
|
SnowflakeObjectDomain.STREAM,
|
|
156
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
128
157
|
):
|
|
129
158
|
return False
|
|
130
159
|
if _is_sys_table(dataset_name):
|
|
@@ -156,7 +185,8 @@ class SnowflakeFilter:
|
|
|
156
185
|
return False
|
|
157
186
|
|
|
158
187
|
if dataset_type.lower() in {
|
|
159
|
-
SnowflakeObjectDomain.TABLE
|
|
188
|
+
SnowflakeObjectDomain.TABLE,
|
|
189
|
+
SnowflakeObjectDomain.DYNAMIC_TABLE,
|
|
160
190
|
} and not self.filter_config.table_pattern.allowed(
|
|
161
191
|
_cleanup_qualified_name(dataset_name, self.structured_reporter)
|
|
162
192
|
):
|
|
@@ -180,6 +210,9 @@ class SnowflakeFilter:
|
|
|
180
210
|
|
|
181
211
|
return True
|
|
182
212
|
|
|
213
|
+
def is_procedure_allowed(self, procedure_name: str) -> bool:
|
|
214
|
+
return self.filter_config.procedure_pattern.allowed(procedure_name)
|
|
215
|
+
|
|
183
216
|
|
|
184
217
|
def _combine_identifier_parts(
|
|
185
218
|
*, table_name: str, schema_name: str, db_name: str
|
|
@@ -318,18 +351,30 @@ class SnowflakeIdentifierBuilder:
|
|
|
318
351
|
user_email: Optional[str],
|
|
319
352
|
) -> str:
|
|
320
353
|
if user_email:
|
|
321
|
-
return self.snowflake_identifier(
|
|
322
|
-
user_email
|
|
323
|
-
if self.identifier_config.email_as_user_identifier is True
|
|
324
|
-
else user_email.split("@")[0]
|
|
325
|
-
)
|
|
354
|
+
return self.snowflake_identifier(user_email)
|
|
326
355
|
return self.snowflake_identifier(
|
|
327
356
|
f"{user_name}@{self.identifier_config.email_domain}"
|
|
328
|
-
if self.identifier_config.
|
|
329
|
-
and self.identifier_config.email_domain is not None
|
|
357
|
+
if self.identifier_config.email_domain is not None
|
|
330
358
|
else user_name
|
|
331
359
|
)
|
|
332
360
|
|
|
361
|
+
def gen_schema_key(self, db_name: str, schema_name: str) -> SchemaKey:
|
|
362
|
+
return gen_schema_key(
|
|
363
|
+
db_name=self.snowflake_identifier(db_name),
|
|
364
|
+
schema=self.snowflake_identifier(schema_name),
|
|
365
|
+
platform=self.platform,
|
|
366
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
367
|
+
env=self.identifier_config.env,
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
def gen_database_key(self, db_name: str) -> DatabaseKey:
|
|
371
|
+
return gen_database_key(
|
|
372
|
+
database=self.snowflake_identifier(db_name),
|
|
373
|
+
platform=self.platform,
|
|
374
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
375
|
+
env=self.identifier_config.env,
|
|
376
|
+
)
|
|
377
|
+
|
|
333
378
|
|
|
334
379
|
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
335
380
|
platform = "snowflake"
|