acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
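
The detailed diffs below reproduce three of the files from the list above: datahub/ingestion/source/ge_data_profiler.py, datahub/ingestion/source/ge_profiling_config.py, and the new datahub/ingestion/source/grafana/entity_mcp_builder.py.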
datahub/ingestion/source/ge_data_profiler.py

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -51,6 +52,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -83,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor

 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)

 _original_get_column_median = SqlAlchemyDataset.get_column_median
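
The guard above works because `importlib.metadata.distribution` raises `PackageNotFoundError` for a distribution that is not installed. A minimal sketch of the same detection pattern (the `_is_installed` helper is hypothetical, not part of this diff):

    import importlib.metadata

    def _is_installed(dist_name: str) -> bool:
        # distribution() raises PackageNotFoundError when the wheel is absent.
        try:
            importlib.metadata.distribution(dist_name)
            return True
        except importlib.metadata.PackageNotFoundError:
            return False

    # On a healthy install of this wheel:
    # _is_installed("acryl-great-expectations") -> True
    # _is_installed("great-expectations")       -> False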
@@ -94,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"

 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -180,6 +205,25 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
+    elif self.engine.dialect.name.lower() == DATABRICKS:
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
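
Since `sa.func.<name>` compiles to a generic function call, the two added branches emit each engine's native approximate-cardinality function. A sketch that prints the compiled SQL instead of executing it (table and column names are made up):

    import sqlalchemy as sa

    t = sa.table("events")
    c = sa.column("user_id")

    # Trino / Athena branch; prints roughly:
    # SELECT approx_distinct(user_id) AS approx_distinct_1 FROM events
    print(sa.select(sa.func.approx_distinct(c)).select_from(t))

    # Databricks branch; prints roughly:
    # SELECT approx_count_distinct(user_id) AS approx_count_distinct_1 FROM events
    print(sa.select(sa.func.approx_count_distinct(c)).select_from(t))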
@@ -263,7 +307,6 @@ def _is_single_row_query_method(query: Any) -> bool:
     "get_column_max",
     "get_column_mean",
     "get_column_stdev",
-    "get_column_nonnull_count",
     "get_column_unique_count",
 }
 CONSTANT_ROW_QUERY_METHODS = {
@@ -287,6 +330,7 @@ def _is_single_row_query_method(query: Any) -> bool:

 FIRST_PARTY_SINGLE_ROW_QUERY_METHODS = {
     "get_column_unique_count_dh_patch",
+    "_get_column_cardinality",
 }

 # We'll do this the inefficient way since the arrays are pretty small.
@@ -453,7 +497,20 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         self, column_spec: _SingleColumnSpec, column: str
     ) -> None:
         try:
-            nonnull_count = self.dataset.get_column_nonnull_count(column)
+            # Don't use Great Expectations get_column_nonnull_count because it
+            # generates this SQL:
+            #
+            # sum(CASE WHEN (mycolumn IN (NULL) OR mycolumn IS NULL) THEN 1 ELSE 0 END)
+            #
+            # which fails for complex types (such as Databricks maps) that don't
+            # support the IN operator.
+            nonnull_count = convert_to_json_serializable(
+                self.dataset.engine.execute(
+                    sa.select(sa.func.count(sa.column(column))).select_from(
+                        self.dataset._table
+                    )
+                ).scalar()
+            )
             column_spec.nonnull_count = nonnull_count
         except Exception as e:
             logger.debug(
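
The rewritten non-null count relies on standard SQL semantics: `COUNT(column)` already skips NULL rows, so no CASE/IN expression over the column values is needed. A sketch of the generated statement:

    import sqlalchemy as sa

    stmt = sa.select(sa.func.count(sa.column("mycolumn"))).select_from(sa.table("t"))
    print(stmt)  # roughly: SELECT count(mycolumn) AS count_1 FROM t -- NULLs not counted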
@@ -602,7 +659,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +667,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
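
The Databricks branch uses `approx_percentile(col, 0.5)`, Spark SQL's approximate percentile aggregate, as a stand-in for an exact median. A sketch of the statement this branch builds (table and column names are made up):

    import sqlalchemy as sa

    column = "price"
    stmt = sa.select(
        sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
    ).select_from(sa.table("sales"))
    print(stmt)  # roughly: SELECT approx_percentile(`price`, 0.5) as approx_median FROM sales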
@@ -698,11 +765,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )

     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1137,26 +1234,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )

-        with
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
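
The rewritten block uses parenthesized context managers, official syntax since Python 3.10, so all four patches, the thread pool, and the query combiner share one `with` statement without backslash continuations. The shape in miniature:

    from contextlib import nullcontext

    with (
        nullcontext("a") as first,
        nullcontext("b") as second,
    ):
        print(first, second)  # a b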
@@ -1359,12 +1464,12 @@ class DatahubGEProfiler:
             )
             return None
         finally:
-            if batch is not None and self.base_engine.engine.name.
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
             ]:
                 if (
-                    self.base_engine.engine.name.
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                     or temp_view is not None
                 ):
                     self._drop_temp_table(batch)
@@ -1413,9 +1518,17 @@
                 logger.error(
                     f"Unexpected {pretty_name} while profiling. Should have 3 parts but has {len(name_parts)} parts."
                 )
+            if platform == DATABRICKS:
+                # TODO: Review logic for BigQuery as well, probably project.dataset.table should be quoted there as well
+                quoted_name = ".".join(
+                    batch.engine.dialect.identifier_preparer.quote(part)
+                    for part in name_parts
+                )
+                batch._table = sa.text(quoted_name)
+                logger.debug(f"Setting quoted table name to be {batch._table}")
             # If we only have two parts that means the project_id is missing from the table name and we add it
             # Temp tables has 3 parts while normal tables only has 2 parts
-            if len(str(batch._table).split(".")) == 2:
+            elif len(str(batch._table).split(".")) == 2:
                 batch._table = sa.text(f"{name_parts[0]}.{str(batch._table)}")
                 logger.debug(f"Setting table name to be {batch._table}")

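
`identifier_preparer.quote` applies the dialect's quoting rules only where an identifier needs them, which is why the Databricks branch can quote every name part and re-join with dots. A sketch using SQLAlchemy's MySQL dialect, whose backtick quoting matches Databricks (names are made up):

    from sqlalchemy.dialects import mysql

    preparer = mysql.dialect().identifier_preparer
    name_parts = ["my-catalog", "schema", "table 1"]
    print(".".join(preparer.quote(part) for part in name_parts))
    # `my-catalog`.schema.`table 1`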
@@ -1559,7 +1672,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )

-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)

     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
datahub/ingestion/source/ge_profiling_config.py

@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional

 import pydantic
 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig

 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,28 +120,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )

-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
     )

-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )

-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )

-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
@@ -157,29 +166,35 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")

-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
     )

-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
     )

-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
     )
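
With the config fields now wrapped in `typing.Annotated`, the per-source support list travels with the type and can be recovered at runtime through `get_type_hints(..., include_extras=True)`. A sketch, assuming `SupportedSources` simply carries the list it was constructed with:

    from typing import get_type_hints

    from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

    hints = get_type_hints(GEProfilingConfig, include_extras=True)
    # The Annotated extras live in __metadata__, e.g.
    # (SupportedSources(["bigquery", "snowflake"]),)
    print(hints["use_sampling"].__metadata__)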
datahub/ingestion/source/grafana/entity_mcp_builder.py (new file)

@@ -0,0 +1,272 @@
+from typing import Dict, List, Optional, Tuple
+
+from datahub.emitter.mce_builder import (
+    make_chart_urn,
+    make_dashboard_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+    make_tag_urn,
+    make_user_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.grafana.models import Dashboard, Panel
+from datahub.ingestion.source.grafana.types import CHART_TYPE_MAPPINGS
+from datahub.metadata.schema_classes import (
+    ChangeAuditStampsClass,
+    ChartInfoClass,
+    DashboardInfoClass,
+    DataPlatformInstanceClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    StatusClass,
+    TagAssociationClass,
+)
+
+
+def build_chart_mcps(
+    panel: Panel,
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    env: str,
+    base_url: str,
+    ingest_tags: bool,
+) -> Tuple[Optional[str], str, List[MetadataChangeProposalWrapper]]:
+    """Build chart metadata change proposals"""
+    ds_urn = None
+    mcps = []
+
+    chart_urn = make_chart_urn(
+        platform,
+        f"{dashboard.uid}.{panel.id}",
+        platform_instance,
+    )
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    # Get input datasets
+    input_datasets = []
+    if panel.datasource_ref:
+        ds_type = panel.datasource_ref.type or "unknown"
+        ds_uid = panel.datasource_ref.uid or "unknown"
+
+        # Add Grafana dataset
+        dataset_name = f"{ds_type}.{ds_uid}.{panel.id}"
+        ds_urn = make_dataset_urn_with_platform_instance(
+            platform=platform,
+            name=dataset_name,
+            platform_instance=platform_instance,
+            env=env,
+        )
+        input_datasets.append(ds_urn)
+
+    # Chart info aspect
+    title = panel.title or f"Panel {panel.id}"
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=ChartInfoClass(
+                type=CHART_TYPE_MAPPINGS.get(panel.type) if panel.type else None,
+                description=panel.description,
+                title=title,
+                lastModified=ChangeAuditStampsClass(),
+                chartUrl=f"{base_url}/d/{dashboard.uid}?viewPanel={panel.id}",
+                customProperties=_build_custom_properties(panel),
+                inputs=input_datasets,
+            ),
+        )
+    )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = []
+        for tag in dashboard.tags:
+            if ":" in tag:
+                key, value = tag.split(":", 1)
+                tag_urn = make_tag_urn(f"{key}.{value}")
+            else:
+                tag_urn = make_tag_urn(tag)
+            tags.append(TagAssociationClass(tag=tag_urn))
+
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=chart_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    return ds_urn, chart_urn, mcps
+
+
+def build_dashboard_mcps(
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    chart_urns: List[str],
+    base_url: str,
+    ingest_owners: bool,
+    ingest_tags: bool,
+) -> Tuple[str, List[MetadataChangeProposalWrapper]]:
+    """Build dashboard metadata change proposals"""
+    mcps = []
+    dashboard_urn = make_dashboard_urn(platform, dashboard.uid, platform_instance)
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Dashboard info aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DashboardInfoClass(
+                description=dashboard.description,
+                title=dashboard.title,
+                charts=chart_urns,
+                lastModified=ChangeAuditStampsClass(),
+                dashboardUrl=f"{base_url}/d/{dashboard.uid}",
+                customProperties=_build_dashboard_properties(dashboard),
+            ),
+        )
+    )
+
+    # Ownership aspect
+    if dashboard.uid and ingest_owners:
+        owner = _build_ownership(dashboard)
+        if owner:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=owner,
+                )
+            )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = [TagAssociationClass(tag=make_tag_urn(tag)) for tag in dashboard.tags]
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    return dashboard_urn, mcps
+
+
+def _build_custom_properties(panel: Panel) -> Dict[str, str]:
+    """Build custom properties for chart"""
+    props = {}
+
+    if panel.type:
+        props["type"] = panel.type
+
+    if panel.datasource_ref:
+        props["datasourceType"] = panel.datasource_ref.type or ""
+        props["datasourceUid"] = panel.datasource_ref.uid or ""
+
+    for key in [
+        "description",
+        "format",
+        "pluginVersion",
+        "repeatDirection",
+        "maxDataPoints",
+    ]:
+        value = getattr(panel, key, None)
+        if value:
+            props[key] = str(value)
+
+    if panel.query_targets:
+        props["targetsCount"] = str(len(panel.query_targets))
+
+    return props
+
+
+def _build_dashboard_properties(dashboard: Dashboard) -> Dict[str, str]:
+    """Build custom properties for dashboard"""
+    props = {}
+
+    if dashboard.timezone:
+        props["timezone"] = dashboard.timezone
+
+    if dashboard.schema_version:
+        props["schema_version"] = dashboard.schema_version
+
+    if dashboard.version:
+        props["version"] = dashboard.version
+
+    if dashboard.refresh:
+        props["refresh"] = dashboard.refresh
+
+    return props
+
+
+def _build_ownership(dashboard: Dashboard) -> Optional[OwnershipClass]:
+    """Build ownership information"""
+    owners = []
+
+    if dashboard.uid:
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(dashboard.uid),
+                type=OwnershipTypeClass.TECHNICAL_OWNER,
+            )
+        )
+
+    if dashboard.created_by:
+        owner_id = dashboard.created_by.split("@")[0]
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(owner_id),
+                type=OwnershipTypeClass.DATAOWNER,
+            )
+        )
+
+    return OwnershipClass(owners=owners) if owners else None