acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -9,6 +9,8 @@ class SnowflakeCloudProvider(StrEnum):
|
|
|
9
9
|
|
|
10
10
|
SNOWFLAKE_DEFAULT_CLOUD = SnowflakeCloudProvider.AWS
|
|
11
11
|
|
|
12
|
+
DEFAULT_SNOWFLAKE_DOMAIN = "snowflakecomputing.com"
|
|
13
|
+
|
|
12
14
|
|
|
13
15
|
class SnowflakeEdition(StrEnum):
|
|
14
16
|
STANDARD = "Standard"
|
|
@@ -54,6 +56,8 @@ class SnowflakeObjectDomain(StrEnum):
|
|
|
54
56
|
COLUMN = "column"
|
|
55
57
|
ICEBERG_TABLE = "iceberg table"
|
|
56
58
|
STREAM = "stream"
|
|
59
|
+
PROCEDURE = "procedure"
|
|
60
|
+
DYNAMIC_TABLE = "dynamic table"
|
|
57
61
|
|
|
58
62
|
|
|
59
63
|
GENERIC_PERMISSION_ERROR_KEY = "permission-error"
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from typing import Dict, List, Optional, Set
|
|
5
6
|
|
|
6
7
|
import pydantic
|
|
7
|
-
from pydantic import Field,
|
|
8
|
+
from pydantic import Field, root_validator, validator
|
|
8
9
|
|
|
9
|
-
from datahub.configuration.common import AllowDenyPattern, ConfigModel
|
|
10
|
+
from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
|
|
10
11
|
from datahub.configuration.pattern_utils import UUID_REGEX
|
|
11
12
|
from datahub.configuration.source_common import (
|
|
12
13
|
EnvConfigMixin,
|
|
@@ -22,6 +23,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
|
|
|
22
23
|
from datahub.ingestion.glossary.classification_mixin import (
|
|
23
24
|
ClassificationSourceConfigMixin,
|
|
24
25
|
)
|
|
26
|
+
from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
|
|
25
27
|
from datahub.ingestion.source.snowflake.snowflake_connection import (
|
|
26
28
|
SnowflakeConnectionConfig,
|
|
27
29
|
)
|
|
@@ -29,6 +31,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
|
|
|
29
31
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
30
32
|
StatefulLineageConfigMixin,
|
|
31
33
|
StatefulProfilingConfigMixin,
|
|
34
|
+
StatefulTimeWindowConfigMixin,
|
|
32
35
|
StatefulUsageConfigMixin,
|
|
33
36
|
)
|
|
34
37
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
@@ -48,9 +51,15 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
|
|
|
48
51
|
rf".*\.SEGMENT_{UUID_REGEX}", # segment
|
|
49
52
|
rf".*\.STAGING_.*_{UUID_REGEX}", # stitch
|
|
50
53
|
r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations
|
|
54
|
+
r".*\.SNOWPARK_TEMP_TABLE_.+", # snowpark
|
|
51
55
|
]
|
|
52
56
|
|
|
53
57
|
|
|
58
|
+
class QueryDedupStrategyType(Enum):
|
|
59
|
+
STANDARD = "STANDARD"
|
|
60
|
+
NONE = "NONE"
|
|
61
|
+
|
|
62
|
+
|
|
54
63
|
class TagOption(StrEnum):
|
|
55
64
|
with_lineage = "with_lineage"
|
|
56
65
|
without_lineage = "without_lineage"
|
|
@@ -59,13 +68,10 @@ class TagOption(StrEnum):
|
|
|
59
68
|
|
|
60
69
|
@dataclass(frozen=True)
|
|
61
70
|
class DatabaseId:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
platform_instance: Optional[str] =
|
|
66
|
-
default=None,
|
|
67
|
-
description="Platform instance of consumer snowflake account.",
|
|
68
|
-
)
|
|
71
|
+
# Database created from share in consumer account
|
|
72
|
+
database: str
|
|
73
|
+
# Platform instance of consumer snowflake account
|
|
74
|
+
platform_instance: Optional[str] = None
|
|
69
75
|
|
|
70
76
|
|
|
71
77
|
class SnowflakeShareConfig(ConfigModel):
|
|
@@ -100,7 +106,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):
|
|
|
100
106
|
|
|
101
107
|
stream_pattern: AllowDenyPattern = Field(
|
|
102
108
|
default=AllowDenyPattern.allow_all(),
|
|
103
|
-
description="Regex patterns for streams to filter in ingestion.
|
|
109
|
+
description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
procedure_pattern: AllowDenyPattern = Field(
|
|
113
|
+
default=AllowDenyPattern.allow_all(),
|
|
114
|
+
description="Regex patterns for procedures to filter in ingestion. "
|
|
115
|
+
"Specify regex to match the entire procedure name in database.schema.procedure format. "
|
|
116
|
+
"e.g. to match all procedures starting with customer in Customer database and public schema,"
|
|
117
|
+
" use the regex 'Customer.public.customer.*'",
|
|
104
118
|
)
|
|
105
119
|
|
|
106
120
|
match_fully_qualified_names: bool = Field(
|
|
@@ -145,14 +159,11 @@ class SnowflakeIdentifierConfig(
|
|
|
145
159
|
|
|
146
160
|
email_domain: Optional[str] = pydantic.Field(
|
|
147
161
|
default=None,
|
|
148
|
-
description="Email domain of your organization so users can be displayed on UI appropriately.",
|
|
162
|
+
description="Email domain of your organization so users can be displayed on UI appropriately. This is used only if we cannot infer email ID.",
|
|
149
163
|
)
|
|
150
164
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
|
|
154
|
-
"provided, generates email addresses for snowflake users with unset emails, based on their "
|
|
155
|
-
"username.",
|
|
165
|
+
_email_as_user_identifier = pydantic_removed_field(
|
|
166
|
+
"email_as_user_identifier",
|
|
156
167
|
)
|
|
157
168
|
|
|
158
169
|
|
|
@@ -189,6 +200,7 @@ class SnowflakeV2Config(
|
|
|
189
200
|
SnowflakeUsageConfig,
|
|
190
201
|
StatefulLineageConfigMixin,
|
|
191
202
|
StatefulUsageConfigMixin,
|
|
203
|
+
StatefulTimeWindowConfigMixin,
|
|
192
204
|
StatefulProfilingConfigMixin,
|
|
193
205
|
ClassificationSourceConfigMixin,
|
|
194
206
|
IncrementalPropertiesConfigMixin,
|
|
@@ -203,6 +215,16 @@ class SnowflakeV2Config(
|
|
|
203
215
|
description="If enabled, populates the ingested views' definitions.",
|
|
204
216
|
)
|
|
205
217
|
|
|
218
|
+
fetch_views_from_information_schema: bool = Field(
|
|
219
|
+
default=False,
|
|
220
|
+
description="If enabled, uses information_schema.views to fetch view definitions instead of SHOW VIEWS command. "
|
|
221
|
+
"This alternative method can be more reliable for databases with large numbers of views (> 10K views), as the "
|
|
222
|
+
"SHOW VIEWS approach has proven unreliable and can lead to missing views in such scenarios. However, this method "
|
|
223
|
+
"requires OWNERSHIP privileges on views to retrieve their definitions. For views without ownership permissions "
|
|
224
|
+
"(where VIEW_DEFINITION is null/empty), the system will automatically fall back to using batched SHOW VIEWS queries "
|
|
225
|
+
"to populate the missing definitions.",
|
|
226
|
+
)
|
|
227
|
+
|
|
206
228
|
include_technical_schema: bool = Field(
|
|
207
229
|
default=True,
|
|
208
230
|
description="If enabled, populates the snowflake technical schema and descriptions.",
|
|
@@ -223,7 +245,7 @@ class SnowflakeV2Config(
|
|
|
223
245
|
)
|
|
224
246
|
|
|
225
247
|
use_queries_v2: bool = Field(
|
|
226
|
-
default=
|
|
248
|
+
default=True,
|
|
227
249
|
description="If enabled, uses the new queries extractor to extract queries from snowflake.",
|
|
228
250
|
)
|
|
229
251
|
include_queries: bool = Field(
|
|
@@ -241,6 +263,11 @@ class SnowflakeV2Config(
|
|
|
241
263
|
"This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
|
|
242
264
|
)
|
|
243
265
|
|
|
266
|
+
query_dedup_strategy: QueryDedupStrategyType = Field(
|
|
267
|
+
default=QueryDedupStrategyType.STANDARD,
|
|
268
|
+
description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
|
|
269
|
+
)
|
|
270
|
+
|
|
244
271
|
_check_role_grants_removed = pydantic_removed_field("check_role_grants")
|
|
245
272
|
_provision_role_removed = pydantic_removed_field("provision_role")
|
|
246
273
|
|
|
@@ -254,10 +281,11 @@ class SnowflakeV2Config(
|
|
|
254
281
|
description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
|
|
255
282
|
)
|
|
256
283
|
|
|
257
|
-
structured_properties_template_cache_invalidation_interval: int =
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
284
|
+
structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
|
|
285
|
+
Field(
|
|
286
|
+
default=60,
|
|
287
|
+
description="Interval in seconds to invalidate the structured properties template cache.",
|
|
288
|
+
)
|
|
261
289
|
)
|
|
262
290
|
|
|
263
291
|
include_external_url: bool = Field(
|
|
@@ -284,10 +312,16 @@ class SnowflakeV2Config(
|
|
|
284
312
|
description="If enabled, streams will be ingested as separate entities from tables/views.",
|
|
285
313
|
)
|
|
286
314
|
|
|
315
|
+
include_procedures: bool = Field(
|
|
316
|
+
default=True,
|
|
317
|
+
description="If enabled, procedures will be ingested as pipelines/tasks.",
|
|
318
|
+
)
|
|
319
|
+
|
|
287
320
|
structured_property_pattern: AllowDenyPattern = Field(
|
|
288
321
|
default=AllowDenyPattern.allow_all(),
|
|
289
322
|
description=(
|
|
290
323
|
"List of regex patterns for structured properties to include in ingestion."
|
|
324
|
+
" Applied to tags with form `<database>.<schema>.<tag_name>`."
|
|
291
325
|
" Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
|
|
292
326
|
),
|
|
293
327
|
)
|
|
@@ -300,7 +334,7 @@ class SnowflakeV2Config(
|
|
|
300
334
|
"to ignore the temporary staging tables created by known ETL tools.",
|
|
301
335
|
)
|
|
302
336
|
|
|
303
|
-
rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
|
|
337
|
+
rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( # type: ignore[pydantic-field]
|
|
304
338
|
"upstreams_deny_pattern", "temporary_tables_pattern"
|
|
305
339
|
)
|
|
306
340
|
|
|
@@ -312,6 +346,17 @@ class SnowflakeV2Config(
|
|
|
312
346
|
" Map of share name -> details of share.",
|
|
313
347
|
)
|
|
314
348
|
|
|
349
|
+
known_snowflake_edition: Optional[SnowflakeEdition] = Field(
|
|
350
|
+
default=None,
|
|
351
|
+
description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# Allows empty containers to be ingested before datasets are added, avoiding permission errors
|
|
355
|
+
warn_no_datasets: HiddenFromDocs[bool] = Field(
|
|
356
|
+
default=False,
|
|
357
|
+
description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
|
|
358
|
+
)
|
|
359
|
+
|
|
315
360
|
include_assertion_results: bool = Field(
|
|
316
361
|
default=False,
|
|
317
362
|
description="Whether to ingest assertion run results for assertions created using Datahub"
|
|
@@ -320,11 +365,32 @@ class SnowflakeV2Config(
|
|
|
320
365
|
|
|
321
366
|
pushdown_deny_usernames: List[str] = Field(
|
|
322
367
|
default=[],
|
|
323
|
-
description="List of snowflake usernames which will
|
|
368
|
+
description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
|
|
324
369
|
"This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
|
|
325
370
|
"Only applicable if `use_queries_v2` is enabled.",
|
|
326
371
|
)
|
|
327
372
|
|
|
373
|
+
pushdown_allow_usernames: List[str] = Field(
|
|
374
|
+
default=[],
|
|
375
|
+
description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
|
|
376
|
+
"This is primarily useful for improving performance by filtering in only specific users. "
|
|
377
|
+
"Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
push_down_database_pattern_access_history: bool = Field(
|
|
381
|
+
default=False,
|
|
382
|
+
description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
|
|
383
|
+
"This filters on the accessed objects in access_history.",
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
additional_database_names_allowlist: List[str] = Field(
|
|
387
|
+
default=[],
|
|
388
|
+
description="Additional database names (no pattern matching) to be included in the access_history filter. "
|
|
389
|
+
"Only applies if push_down_database_pattern_access_history=True. "
|
|
390
|
+
"These databases will be included in the filter being pushed down regardless of database_pattern settings."
|
|
391
|
+
"This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
|
|
392
|
+
)
|
|
393
|
+
|
|
328
394
|
@validator("convert_urns_to_lowercase")
|
|
329
395
|
def validate_convert_urns_to_lowercase(cls, v):
|
|
330
396
|
if not v:
|
|
@@ -371,17 +437,6 @@ class SnowflakeV2Config(
|
|
|
371
437
|
|
|
372
438
|
return values
|
|
373
439
|
|
|
374
|
-
def get_sql_alchemy_url(
|
|
375
|
-
self,
|
|
376
|
-
database: Optional[str] = None,
|
|
377
|
-
username: Optional[str] = None,
|
|
378
|
-
password: Optional[SecretStr] = None,
|
|
379
|
-
role: Optional[str] = None,
|
|
380
|
-
) -> str:
|
|
381
|
-
return SnowflakeConnectionConfig.get_sql_alchemy_url(
|
|
382
|
-
self, database=database, username=username, password=password, role=role
|
|
383
|
-
)
|
|
384
|
-
|
|
385
440
|
@validator("shares")
|
|
386
441
|
def validate_shares(
|
|
387
442
|
cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
|
|
@@ -424,6 +479,20 @@ class SnowflakeV2Config(
|
|
|
424
479
|
|
|
425
480
|
return shares
|
|
426
481
|
|
|
482
|
+
@root_validator(pre=False, skip_on_failure=True)
|
|
483
|
+
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
|
|
484
|
+
if values.get("use_queries_v2"):
|
|
485
|
+
if values.get("enable_stateful_lineage_ingestion") or values.get(
|
|
486
|
+
"enable_stateful_usage_ingestion"
|
|
487
|
+
):
|
|
488
|
+
logger.warning(
|
|
489
|
+
"enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
|
|
490
|
+
"when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
|
|
491
|
+
"For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
|
|
492
|
+
"for the unified time window extraction (lineage + usage + operations + queries)."
|
|
493
|
+
)
|
|
494
|
+
return values
|
|
495
|
+
|
|
427
496
|
def outbounds(self) -> Dict[str, Set[DatabaseId]]:
|
|
428
497
|
"""
|
|
429
498
|
Returns mapping of
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import threading
|
|
2
3
|
from typing import Any, Dict, Optional
|
|
3
4
|
|
|
4
5
|
import pydantic
|
|
@@ -14,20 +15,26 @@ from snowflake.connector.network import (
|
|
|
14
15
|
OAUTH_AUTHENTICATOR,
|
|
15
16
|
)
|
|
16
17
|
|
|
17
|
-
from datahub.configuration.common import
|
|
18
|
+
from datahub.configuration.common import (
|
|
19
|
+
ConfigModel,
|
|
20
|
+
ConfigurationError,
|
|
21
|
+
HiddenFromDocs,
|
|
22
|
+
MetaError,
|
|
23
|
+
)
|
|
18
24
|
from datahub.configuration.connection_resolver import auto_connection_resolver
|
|
19
25
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
|
20
26
|
from datahub.ingestion.api.closeable import Closeable
|
|
21
27
|
from datahub.ingestion.source.snowflake.constants import (
|
|
22
28
|
CLIENT_PREFETCH_THREADS,
|
|
23
29
|
CLIENT_SESSION_KEEP_ALIVE,
|
|
30
|
+
DEFAULT_SNOWFLAKE_DOMAIN,
|
|
24
31
|
)
|
|
25
32
|
from datahub.ingestion.source.snowflake.oauth_config import (
|
|
26
33
|
OAuthConfiguration,
|
|
27
34
|
OAuthIdentityProvider,
|
|
28
35
|
)
|
|
29
36
|
from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
|
|
30
|
-
from datahub.ingestion.source.sql.
|
|
37
|
+
from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
|
|
31
38
|
from datahub.utilities.config_clean import (
|
|
32
39
|
remove_protocol,
|
|
33
40
|
remove_suffix,
|
|
@@ -46,8 +53,6 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
|
|
|
46
53
|
"OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
|
|
47
54
|
}
|
|
48
55
|
|
|
49
|
-
_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
|
|
50
|
-
|
|
51
56
|
|
|
52
57
|
class SnowflakePermissionError(MetaError):
|
|
53
58
|
"""A permission error has happened"""
|
|
@@ -63,7 +68,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
63
68
|
description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
|
|
64
69
|
)
|
|
65
70
|
|
|
66
|
-
scheme: str = "snowflake"
|
|
71
|
+
scheme: HiddenFromDocs[str] = "snowflake"
|
|
67
72
|
username: Optional[str] = pydantic.Field(
|
|
68
73
|
default=None, description="Snowflake username."
|
|
69
74
|
)
|
|
@@ -109,18 +114,25 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
109
114
|
default=None,
|
|
110
115
|
description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
|
|
111
116
|
)
|
|
117
|
+
snowflake_domain: str = pydantic.Field(
|
|
118
|
+
default=DEFAULT_SNOWFLAKE_DOMAIN,
|
|
119
|
+
description="Snowflake domain. Use 'snowflakecomputing.com' for most regions or 'snowflakecomputing.cn' for China (cn-northwest-1) region.",
|
|
120
|
+
)
|
|
112
121
|
|
|
113
122
|
def get_account(self) -> str:
|
|
114
123
|
assert self.account_id
|
|
115
124
|
return self.account_id
|
|
116
125
|
|
|
117
|
-
rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
|
|
126
|
+
rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id") # type: ignore[pydantic-field]
|
|
118
127
|
|
|
119
128
|
@pydantic.validator("account_id")
|
|
120
|
-
def validate_account_id(cls, account_id: str) -> str:
|
|
129
|
+
def validate_account_id(cls, account_id: str, values: Dict) -> str:
|
|
121
130
|
account_id = remove_protocol(account_id)
|
|
122
131
|
account_id = remove_trailing_slashes(account_id)
|
|
123
|
-
|
|
132
|
+
# Get the domain from config, fallback to default
|
|
133
|
+
domain = values.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
|
|
134
|
+
snowflake_host_suffix = f".{domain}"
|
|
135
|
+
account_id = remove_suffix(account_id, snowflake_host_suffix)
|
|
124
136
|
return account_id
|
|
125
137
|
|
|
126
138
|
@pydantic.validator("authentication_type", always=True)
|
|
@@ -192,23 +204,11 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
192
204
|
"but should be set when using use_certificate false for oauth_config"
|
|
193
205
|
)
|
|
194
206
|
|
|
195
|
-
def get_sql_alchemy_url(
|
|
196
|
-
self,
|
|
197
|
-
database: Optional[str] = None,
|
|
198
|
-
username: Optional[str] = None,
|
|
199
|
-
password: Optional[pydantic.SecretStr] = None,
|
|
200
|
-
role: Optional[str] = None,
|
|
201
|
-
) -> str:
|
|
202
|
-
if username is None:
|
|
203
|
-
username = self.username
|
|
204
|
-
if password is None:
|
|
205
|
-
password = self.password
|
|
206
|
-
if role is None:
|
|
207
|
-
role = self.role
|
|
207
|
+
def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
|
|
208
208
|
return make_sqlalchemy_uri(
|
|
209
209
|
self.scheme,
|
|
210
|
-
username,
|
|
211
|
-
password.get_secret_value() if password else None,
|
|
210
|
+
self.username,
|
|
211
|
+
self.password.get_secret_value() if self.password else None,
|
|
212
212
|
self.account_id,
|
|
213
213
|
f'"{database}"' if database is not None else database,
|
|
214
214
|
uri_opts={
|
|
@@ -217,7 +217,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
217
217
|
for (key, value) in {
|
|
218
218
|
"authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
|
|
219
219
|
"warehouse": self.warehouse,
|
|
220
|
-
"role": role,
|
|
220
|
+
"role": self.role,
|
|
221
221
|
"application": _APPLICATION_NAME,
|
|
222
222
|
}.items()
|
|
223
223
|
if value
|
|
@@ -322,6 +322,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
322
322
|
warehouse=self.warehouse,
|
|
323
323
|
authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
|
|
324
324
|
application=_APPLICATION_NAME,
|
|
325
|
+
host=f"{self.account_id}.{self.snowflake_domain}",
|
|
325
326
|
**connect_args,
|
|
326
327
|
)
|
|
327
328
|
|
|
@@ -335,6 +336,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
335
336
|
role=self.role,
|
|
336
337
|
authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
|
|
337
338
|
application=_APPLICATION_NAME,
|
|
339
|
+
host=f"{self.account_id}.{self.snowflake_domain}",
|
|
338
340
|
**connect_args,
|
|
339
341
|
)
|
|
340
342
|
|
|
@@ -348,6 +350,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
348
350
|
warehouse=self.warehouse,
|
|
349
351
|
role=self.role,
|
|
350
352
|
application=_APPLICATION_NAME,
|
|
353
|
+
host=f"{self.account_id}.{self.snowflake_domain}",
|
|
351
354
|
**connect_args,
|
|
352
355
|
)
|
|
353
356
|
elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
|
|
@@ -359,6 +362,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
359
362
|
warehouse=self.warehouse,
|
|
360
363
|
role=self.role,
|
|
361
364
|
application=_APPLICATION_NAME,
|
|
365
|
+
host=f"{self.account_id}.{self.snowflake_domain}",
|
|
362
366
|
**connect_args,
|
|
363
367
|
)
|
|
364
368
|
elif self.authentication_type == "OAUTH_AUTHENTICATOR":
|
|
@@ -374,6 +378,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
374
378
|
role=self.role,
|
|
375
379
|
authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
|
|
376
380
|
application=_APPLICATION_NAME,
|
|
381
|
+
host=f"{self.account_id}.{self.snowflake_domain}",
|
|
377
382
|
**connect_args,
|
|
378
383
|
)
|
|
379
384
|
else:
|
|
@@ -402,13 +407,30 @@ class SnowflakeConnection(Closeable):
|
|
|
402
407
|
def __init__(self, connection: NativeSnowflakeConnection):
|
|
403
408
|
self._connection = connection
|
|
404
409
|
|
|
410
|
+
self._query_num_lock = threading.Lock()
|
|
411
|
+
self._query_num = 1
|
|
412
|
+
|
|
405
413
|
def native_connection(self) -> NativeSnowflakeConnection:
|
|
406
414
|
return self._connection
|
|
407
415
|
|
|
416
|
+
def get_query_no(self) -> int:
|
|
417
|
+
with self._query_num_lock:
|
|
418
|
+
no = self._query_num
|
|
419
|
+
self._query_num += 1
|
|
420
|
+
return no
|
|
421
|
+
|
|
408
422
|
def query(self, query: str) -> Any:
|
|
409
423
|
try:
|
|
410
|
-
|
|
424
|
+
# We often run multiple queries in parallel across multiple threads,
|
|
425
|
+
# so we need to number them to help with log readability.
|
|
426
|
+
query_num = self.get_query_no()
|
|
427
|
+
logger.info(f"Query #{query_num}: {query.rstrip()}", stacklevel=2)
|
|
411
428
|
resp = self._connection.cursor(DictCursor).execute(query)
|
|
429
|
+
if resp is not None and resp.rowcount is not None:
|
|
430
|
+
logger.info(
|
|
431
|
+
f"Query #{query_num} got {resp.rowcount} row(s) back from Snowflake",
|
|
432
|
+
stacklevel=2,
|
|
433
|
+
)
|
|
412
434
|
return resp
|
|
413
435
|
|
|
414
436
|
except Exception as e:
|
|
@@ -2,7 +2,17 @@ import json
|
|
|
2
2
|
import logging
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from datetime import datetime
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import (
|
|
6
|
+
TYPE_CHECKING,
|
|
7
|
+
Any,
|
|
8
|
+
Collection,
|
|
9
|
+
Iterable,
|
|
10
|
+
List,
|
|
11
|
+
Optional,
|
|
12
|
+
Set,
|
|
13
|
+
Tuple,
|
|
14
|
+
Type,
|
|
15
|
+
)
|
|
6
16
|
|
|
7
17
|
from pydantic import BaseModel, Field, validator
|
|
8
18
|
|
|
@@ -44,6 +54,9 @@ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
|
|
|
44
54
|
from datahub.utilities.perf_timer import PerfTimer
|
|
45
55
|
from datahub.utilities.time import ts_millis_to_datetime
|
|
46
56
|
|
|
57
|
+
if TYPE_CHECKING:
|
|
58
|
+
from pydantic.deprecated.class_validators import V1Validator
|
|
59
|
+
|
|
47
60
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
48
61
|
|
|
49
62
|
EXTERNAL_LINEAGE = "external_lineage"
|
|
@@ -51,7 +64,7 @@ TABLE_LINEAGE = "table_lineage"
|
|
|
51
64
|
VIEW_LINEAGE = "view_lineage"
|
|
52
65
|
|
|
53
66
|
|
|
54
|
-
def pydantic_parse_json(field: str) ->
|
|
67
|
+
def pydantic_parse_json(field: str) -> "V1Validator":
|
|
55
68
|
def _parse_from_json(cls: Type, v: Any) -> dict:
|
|
56
69
|
if isinstance(v, str):
|
|
57
70
|
return json.loads(v)
|
|
@@ -72,7 +85,7 @@ class ColumnUpstreamJob(BaseModel):
|
|
|
72
85
|
|
|
73
86
|
|
|
74
87
|
class ColumnUpstreamLineage(BaseModel):
|
|
75
|
-
column_name: Optional[str]
|
|
88
|
+
column_name: Optional[str] = None
|
|
76
89
|
upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)
|
|
77
90
|
|
|
78
91
|
|
|
@@ -91,9 +104,9 @@ class Query(BaseModel):
|
|
|
91
104
|
class UpstreamLineageEdge(BaseModel):
|
|
92
105
|
DOWNSTREAM_TABLE_NAME: str
|
|
93
106
|
DOWNSTREAM_TABLE_DOMAIN: str
|
|
94
|
-
UPSTREAM_TABLES: Optional[List[UpstreamTableNode]]
|
|
95
|
-
UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]]
|
|
96
|
-
QUERIES: Optional[List[Query]]
|
|
107
|
+
UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None
|
|
108
|
+
UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None
|
|
109
|
+
QUERIES: Optional[List[Query]] = None
|
|
97
110
|
|
|
98
111
|
_json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES")
|
|
99
112
|
_json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS")
|
|
@@ -360,6 +373,12 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
|
|
|
360
373
|
self, db_row: dict
|
|
361
374
|
) -> Optional[UpstreamLineageEdge]:
|
|
362
375
|
try:
|
|
376
|
+
_queries = db_row.get("QUERIES")
|
|
377
|
+
if _queries == "[\n {}\n]":
|
|
378
|
+
# We are creating an empty object in the list when there are no queries
|
|
379
|
+
# To avoid that causing a pydantic error we are setting it to an empty list
|
|
380
|
+
# instead of a list with an empty object
|
|
381
|
+
db_row["QUERIES"] = "[]"
|
|
363
382
|
return UpstreamLineageEdge.parse_obj(db_row)
|
|
364
383
|
except Exception as e:
|
|
365
384
|
self.report.num_upstream_lineage_edge_parsing_failed += 1
|
|
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
|
|
|
135
135
|
) -> "DatahubGEProfiler":
|
|
136
136
|
assert db_name
|
|
137
137
|
|
|
138
|
-
url = self.config.get_sql_alchemy_url(
|
|
139
|
-
database=db_name,
|
|
140
|
-
username=self.config.username,
|
|
141
|
-
password=self.config.password,
|
|
142
|
-
role=self.config.role,
|
|
143
|
-
)
|
|
138
|
+
url = self.config.get_sql_alchemy_url(database=db_name)
|
|
144
139
|
|
|
145
140
|
logger.debug(f"sql_alchemy_url={url}")
|
|
146
141
|
|