acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
|
|
|
42
42
|
"max_overflow", self.config.profiling.max_workers
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
for db in tables
|
|
45
|
+
for db in tables:
|
|
46
46
|
profile_requests = []
|
|
47
|
-
for schema in tables.get(db, {})
|
|
47
|
+
for schema in tables.get(db, {}):
|
|
48
48
|
if not self.config.schema_pattern.allowed(schema):
|
|
49
49
|
continue
|
|
50
50
|
for table in tables[db].get(schema, {}):
|
|
@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
|
|
|
44
44
|
SELECT
|
|
45
45
|
schema_name,
|
|
46
46
|
schema_type,
|
|
47
|
-
schema_option,
|
|
47
|
+
cast(null as varchar(1024)) as schema_option,
|
|
48
48
|
cast(null as varchar(256)) as external_platform,
|
|
49
49
|
cast(null as varchar(256)) as external_database
|
|
50
50
|
FROM svv_redshift_schemas
|
|
@@ -89,7 +89,7 @@ class RedshiftCommonQuery:
|
|
|
89
89
|
) -> str:
|
|
90
90
|
# NOTE: it looks like description is available only in pg_description
|
|
91
91
|
# So this remains preferrred way
|
|
92
|
-
tables_query = """
|
|
92
|
+
tables_query = f"""
|
|
93
93
|
SELECT CASE c.relkind
|
|
94
94
|
WHEN 'r' THEN 'TABLE'
|
|
95
95
|
WHEN 'v' THEN 'VIEW'
|
|
@@ -120,6 +120,7 @@ class RedshiftCommonQuery:
|
|
|
120
120
|
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
|
|
121
121
|
LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
|
|
122
122
|
LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
|
|
123
|
+
JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database}'
|
|
123
124
|
WHERE c.relkind IN ('r','v','m','S','f')
|
|
124
125
|
AND n.nspname !~ '^pg_'
|
|
125
126
|
AND n.nspname != 'information_schema'
|
|
@@ -128,23 +129,24 @@ class RedshiftCommonQuery:
|
|
|
128
129
|
external_tables_query = f"""
|
|
129
130
|
SELECT 'EXTERNAL_TABLE' as tabletype,
|
|
130
131
|
NULL AS "schema_oid",
|
|
131
|
-
schemaname AS "schema",
|
|
132
|
+
t.schemaname AS "schema",
|
|
132
133
|
NULL AS "rel_oid",
|
|
133
|
-
tablename AS "relname",
|
|
134
|
+
t.tablename AS "relname",
|
|
134
135
|
NULL as "creation_time",
|
|
135
136
|
NULL AS "diststyle",
|
|
136
137
|
NULL AS "owner_id",
|
|
137
138
|
NULL AS "owner_name",
|
|
138
139
|
NULL AS "view_definition",
|
|
139
140
|
NULL AS "privileges",
|
|
140
|
-
"location",
|
|
141
|
-
parameters,
|
|
142
|
-
input_format,
|
|
143
|
-
output_format,
|
|
144
|
-
serde_parameters,
|
|
141
|
+
t."location",
|
|
142
|
+
t.parameters,
|
|
143
|
+
t.input_format,
|
|
144
|
+
t.output_format,
|
|
145
|
+
t.serde_parameters,
|
|
145
146
|
NULL as table_description
|
|
146
|
-
FROM pg_catalog.svv_external_tables
|
|
147
|
-
|
|
147
|
+
FROM pg_catalog.svv_external_tables t
|
|
148
|
+
JOIN SVV_EXTERNAL_SCHEMAS s ON t.schemaname = s.schemaname
|
|
149
|
+
WHERE t.redshift_database_name='{database}'
|
|
148
150
|
ORDER BY "schema",
|
|
149
151
|
"relname"
|
|
150
152
|
"""
|
|
@@ -232,11 +234,12 @@ class RedshiftCommonQuery:
|
|
|
232
234
|
ON att.attrelid = c.oid
|
|
233
235
|
LEFT JOIN pg_catalog.pg_attrdef ad
|
|
234
236
|
ON (att.attrelid, att.attnum) = (ad.adrelid, ad.adnum)
|
|
237
|
+
JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database_name}'
|
|
235
238
|
WHERE n.nspname !~ '^pg_'
|
|
236
239
|
AND n.nspname != 'information_schema'
|
|
237
240
|
AND att.attnum > 0
|
|
238
241
|
AND NOT att.attisdropped
|
|
239
|
-
and
|
|
242
|
+
and n.nspname = '{schema_name}'
|
|
240
243
|
UNION
|
|
241
244
|
SELECT
|
|
242
245
|
view_schema as "schema",
|
|
@@ -263,26 +266,27 @@ class RedshiftCommonQuery:
|
|
|
263
266
|
WHERE 1 and schema = '{schema_name}'
|
|
264
267
|
UNION
|
|
265
268
|
SELECT
|
|
266
|
-
schemaname as "schema",
|
|
267
|
-
tablename as "table_name",
|
|
268
|
-
columnname as "name",
|
|
269
|
+
c.schemaname as "schema",
|
|
270
|
+
c.tablename as "table_name",
|
|
271
|
+
c.columnname as "name",
|
|
269
272
|
null as "encode",
|
|
270
273
|
-- Spectrum represents data types differently.
|
|
271
274
|
-- Standardize, so we can infer types.
|
|
272
|
-
external_type AS "type",
|
|
275
|
+
c.external_type AS "type",
|
|
273
276
|
null as "distkey",
|
|
274
277
|
0 as "sortkey",
|
|
275
278
|
null as "notnull",
|
|
276
279
|
null as "comment",
|
|
277
280
|
null as "adsrc",
|
|
278
281
|
null as "attnum",
|
|
279
|
-
external_type AS "format_type",
|
|
282
|
+
c.external_type AS "format_type",
|
|
280
283
|
null as "default",
|
|
281
284
|
null as "schema_oid",
|
|
282
285
|
null as "table_oid"
|
|
283
|
-
FROM SVV_EXTERNAL_COLUMNS
|
|
284
|
-
|
|
285
|
-
|
|
286
|
+
FROM SVV_EXTERNAL_COLUMNS c
|
|
287
|
+
JOIN SVV_EXTERNAL_SCHEMAS s ON c.schemaname = s.schemaname
|
|
288
|
+
WHERE c.schemaname = '{schema_name}'
|
|
289
|
+
AND c.redshift_database_name = '{database_name}'
|
|
286
290
|
ORDER BY "schema", "table_name", "attnum"
|
|
287
291
|
"""
|
|
288
292
|
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import functools
|
|
2
|
-
import itertools
|
|
3
2
|
import logging
|
|
4
3
|
from collections import defaultdict
|
|
5
4
|
from typing import Dict, Iterable, List, Optional, Type, Union
|
|
@@ -10,6 +9,7 @@ import humanfriendly
|
|
|
10
9
|
import pydantic
|
|
11
10
|
import redshift_connector
|
|
12
11
|
|
|
12
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
13
13
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
14
14
|
from datahub.emitter.mce_builder import (
|
|
15
15
|
make_data_platform_urn,
|
|
@@ -46,12 +46,12 @@ from datahub.ingestion.source.common.data_reader import DataReader
|
|
|
46
46
|
from datahub.ingestion.source.common.subtypes import (
|
|
47
47
|
DatasetContainerSubTypes,
|
|
48
48
|
DatasetSubTypes,
|
|
49
|
+
SourceCapabilityModifier,
|
|
49
50
|
)
|
|
50
51
|
from datahub.ingestion.source.redshift.config import RedshiftConfig
|
|
51
52
|
from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
|
|
52
53
|
from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
|
|
53
|
-
from datahub.ingestion.source.redshift.lineage import
|
|
54
|
-
from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
|
|
54
|
+
from datahub.ingestion.source.redshift.lineage import RedshiftSqlLineage
|
|
55
55
|
from datahub.ingestion.source.redshift.profile import RedshiftProfiler
|
|
56
56
|
from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
|
|
57
57
|
from datahub.ingestion.source.redshift.redshift_schema import (
|
|
@@ -70,7 +70,6 @@ from datahub.ingestion.source.sql.sql_utils import (
|
|
|
70
70
|
add_table_to_schema_container,
|
|
71
71
|
gen_database_container,
|
|
72
72
|
gen_database_key,
|
|
73
|
-
gen_lineage,
|
|
74
73
|
gen_schema_container,
|
|
75
74
|
gen_schema_key,
|
|
76
75
|
get_dataplatform_instance_aspect,
|
|
@@ -90,8 +89,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
90
89
|
from datahub.ingestion.source_report.ingestion_stage import (
|
|
91
90
|
LINEAGE_EXTRACTION,
|
|
92
91
|
METADATA_EXTRACTION,
|
|
93
|
-
PROFILING,
|
|
94
92
|
USAGE_EXTRACTION_INGESTION,
|
|
93
|
+
IngestionHighStage,
|
|
95
94
|
)
|
|
96
95
|
from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes, TimeStamp
|
|
97
96
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
@@ -114,7 +113,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
114
113
|
)
|
|
115
114
|
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
|
|
116
115
|
from datahub.utilities import memory_footprint
|
|
117
|
-
from datahub.utilities.dedup_list import deduplicate_list
|
|
118
116
|
from datahub.utilities.mapping import Constants
|
|
119
117
|
from datahub.utilities.perf_timer import PerfTimer
|
|
120
118
|
from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
@@ -125,7 +123,14 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
125
123
|
@platform_name("Redshift")
|
|
126
124
|
@config_class(RedshiftConfig)
|
|
127
125
|
@support_status(SupportStatus.CERTIFIED)
|
|
128
|
-
@capability(
|
|
126
|
+
@capability(
|
|
127
|
+
SourceCapability.CONTAINERS,
|
|
128
|
+
"Enabled by default",
|
|
129
|
+
subtype_modifier=[
|
|
130
|
+
SourceCapabilityModifier.DATABASE,
|
|
131
|
+
SourceCapabilityModifier.SCHEMA,
|
|
132
|
+
],
|
|
133
|
+
)
|
|
129
134
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
130
135
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
131
136
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
@@ -138,14 +143,17 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
138
143
|
@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
|
|
139
144
|
@capability(
|
|
140
145
|
SourceCapability.USAGE_STATS,
|
|
141
|
-
"
|
|
146
|
+
"Optionally enabled via `include_usage_statistics`",
|
|
147
|
+
)
|
|
148
|
+
@capability(
|
|
149
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
142
150
|
)
|
|
143
|
-
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
|
|
144
151
|
@capability(
|
|
145
152
|
SourceCapability.CLASSIFICATION,
|
|
146
153
|
"Optionally enabled via `classification.enabled`",
|
|
147
154
|
supported=True,
|
|
148
155
|
)
|
|
156
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
149
157
|
class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
150
158
|
"""
|
|
151
159
|
This plugin extracts the following:
|
|
@@ -354,7 +362,23 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
354
362
|
).workunit_processor,
|
|
355
363
|
]
|
|
356
364
|
|
|
365
|
+
def _warn_deprecated_configs(self):
|
|
366
|
+
if (
|
|
367
|
+
self.config.match_fully_qualified_names is not None
|
|
368
|
+
and not self.config.match_fully_qualified_names
|
|
369
|
+
and self.config.schema_pattern is not None
|
|
370
|
+
and self.config.schema_pattern != AllowDenyPattern.allow_all()
|
|
371
|
+
):
|
|
372
|
+
self.report.report_warning(
|
|
373
|
+
message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
|
|
374
|
+
"Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
|
|
375
|
+
"The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
|
|
376
|
+
context="Config option deprecation warning",
|
|
377
|
+
title="Config option deprecation warning",
|
|
378
|
+
)
|
|
379
|
+
|
|
357
380
|
def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
|
|
381
|
+
self._warn_deprecated_configs()
|
|
358
382
|
connection = self._try_get_redshift_connection(self.config)
|
|
359
383
|
|
|
360
384
|
if connection is None:
|
|
@@ -395,40 +419,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
395
419
|
memory_footprint.total_size(self.db_views)
|
|
396
420
|
)
|
|
397
421
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
)
|
|
409
|
-
|
|
410
|
-
with self.report.new_stage(LINEAGE_EXTRACTION):
|
|
411
|
-
yield from self.extract_lineage_v2(
|
|
412
|
-
connection=connection,
|
|
413
|
-
database=database,
|
|
414
|
-
lineage_extractor=lineage_extractor,
|
|
415
|
-
)
|
|
416
|
-
|
|
417
|
-
all_tables = self.get_all_tables()
|
|
418
|
-
else:
|
|
419
|
-
yield from self.process_schemas(connection, database)
|
|
422
|
+
with RedshiftSqlLineage(
|
|
423
|
+
config=self.config,
|
|
424
|
+
report=self.report,
|
|
425
|
+
context=self.ctx,
|
|
426
|
+
database=database,
|
|
427
|
+
redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
|
|
428
|
+
) as lineage_extractor:
|
|
429
|
+
yield from lineage_extractor.aggregator.register_schemas_from_stream(
|
|
430
|
+
self.process_schemas(connection, database)
|
|
431
|
+
)
|
|
420
432
|
|
|
421
|
-
|
|
433
|
+
with self.report.new_stage(LINEAGE_EXTRACTION):
|
|
434
|
+
yield from self.extract_lineage_v2(
|
|
435
|
+
connection=connection,
|
|
436
|
+
database=database,
|
|
437
|
+
lineage_extractor=lineage_extractor,
|
|
438
|
+
)
|
|
422
439
|
|
|
423
|
-
|
|
424
|
-
self.config.include_table_lineage
|
|
425
|
-
or self.config.include_view_lineage
|
|
426
|
-
or self.config.include_copy_lineage
|
|
427
|
-
):
|
|
428
|
-
with self.report.new_stage(LINEAGE_EXTRACTION):
|
|
429
|
-
yield from self.extract_lineage(
|
|
430
|
-
connection=connection, all_tables=all_tables, database=database
|
|
431
|
-
)
|
|
440
|
+
all_tables = self.get_all_tables()
|
|
432
441
|
|
|
433
442
|
if self.config.include_usage_statistics:
|
|
434
443
|
with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
|
|
@@ -437,7 +446,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
437
446
|
)
|
|
438
447
|
|
|
439
448
|
if self.config.is_profiling_enabled():
|
|
440
|
-
with self.report.
|
|
449
|
+
with self.report.new_high_stage(IngestionHighStage.PROFILING):
|
|
441
450
|
profiler = RedshiftProfiler(
|
|
442
451
|
config=self.config,
|
|
443
452
|
report=self.report,
|
|
@@ -940,45 +949,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
940
949
|
|
|
941
950
|
self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
|
|
942
951
|
|
|
943
|
-
def extract_lineage(
|
|
944
|
-
self,
|
|
945
|
-
connection: redshift_connector.Connection,
|
|
946
|
-
database: str,
|
|
947
|
-
all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
|
|
948
|
-
) -> Iterable[MetadataWorkUnit]:
|
|
949
|
-
if not self._should_ingest_lineage():
|
|
950
|
-
return
|
|
951
|
-
|
|
952
|
-
lineage_extractor = RedshiftLineageExtractor(
|
|
953
|
-
config=self.config,
|
|
954
|
-
report=self.report,
|
|
955
|
-
context=self.ctx,
|
|
956
|
-
redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
|
|
957
|
-
)
|
|
958
|
-
|
|
959
|
-
with PerfTimer() as timer:
|
|
960
|
-
lineage_extractor.populate_lineage(
|
|
961
|
-
database=database, connection=connection, all_tables=all_tables
|
|
962
|
-
)
|
|
963
|
-
|
|
964
|
-
self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
|
|
965
|
-
digits=2
|
|
966
|
-
)
|
|
967
|
-
yield from self.generate_lineage(
|
|
968
|
-
database, lineage_extractor=lineage_extractor
|
|
969
|
-
)
|
|
970
|
-
|
|
971
|
-
if self.redundant_lineage_run_skip_handler:
|
|
972
|
-
# Update the checkpoint state for this run.
|
|
973
|
-
self.redundant_lineage_run_skip_handler.update_state(
|
|
974
|
-
self.config.start_time, self.config.end_time
|
|
975
|
-
)
|
|
976
|
-
|
|
977
952
|
def extract_lineage_v2(
|
|
978
953
|
self,
|
|
979
954
|
connection: redshift_connector.Connection,
|
|
980
955
|
database: str,
|
|
981
|
-
lineage_extractor:
|
|
956
|
+
lineage_extractor: RedshiftSqlLineage,
|
|
982
957
|
) -> Iterable[MetadataWorkUnit]:
|
|
983
958
|
if self.config.include_share_lineage:
|
|
984
959
|
outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
|
|
@@ -1041,40 +1016,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1041
1016
|
|
|
1042
1017
|
return True
|
|
1043
1018
|
|
|
1044
|
-
def generate_lineage(
|
|
1045
|
-
self, database: str, lineage_extractor: RedshiftLineageExtractor
|
|
1046
|
-
) -> Iterable[MetadataWorkUnit]:
|
|
1047
|
-
logger.info(f"Generate lineage for {database}")
|
|
1048
|
-
for schema in deduplicate_list(
|
|
1049
|
-
itertools.chain(self.db_tables[database], self.db_views[database])
|
|
1050
|
-
):
|
|
1051
|
-
if (
|
|
1052
|
-
database not in self.db_schemas
|
|
1053
|
-
or schema not in self.db_schemas[database]
|
|
1054
|
-
):
|
|
1055
|
-
logger.warning(
|
|
1056
|
-
f"Either database {database} or {schema} exists in the lineage but was not discovered earlier. Something went wrong."
|
|
1057
|
-
)
|
|
1058
|
-
continue
|
|
1059
|
-
|
|
1060
|
-
table_or_view: Union[RedshiftTable, RedshiftView]
|
|
1061
|
-
for table_or_view in (
|
|
1062
|
-
[]
|
|
1063
|
-
+ self.db_tables[database].get(schema, [])
|
|
1064
|
-
+ self.db_views[database].get(schema, [])
|
|
1065
|
-
):
|
|
1066
|
-
datahub_dataset_name = f"{database}.{schema}.{table_or_view.name}"
|
|
1067
|
-
dataset_urn = self.gen_dataset_urn(datahub_dataset_name)
|
|
1068
|
-
|
|
1069
|
-
lineage_info = lineage_extractor.get_lineage(
|
|
1070
|
-
table_or_view,
|
|
1071
|
-
dataset_urn,
|
|
1072
|
-
self.db_schemas[database][schema],
|
|
1073
|
-
)
|
|
1074
|
-
if lineage_info:
|
|
1075
|
-
# incremental lineage generation is taken care by auto_incremental_lineage
|
|
1076
|
-
yield from gen_lineage(dataset_urn, lineage_info)
|
|
1077
|
-
|
|
1078
1019
|
def add_config_to_report(self):
|
|
1079
1020
|
self.report.stateful_lineage_ingestion_enabled = (
|
|
1080
1021
|
self.config.enable_stateful_lineage_ingestion
|
|
@@ -15,6 +15,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable
|
|
|
15
15
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
|
|
16
16
|
from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
|
|
17
17
|
from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
|
|
18
|
+
from datahub.utilities.perf_timer import PerfTimer
|
|
18
19
|
|
|
19
20
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
@@ -243,9 +244,13 @@ class RedshiftDataDictionary:
|
|
|
243
244
|
conn: redshift_connector.Connection, query: str
|
|
244
245
|
) -> redshift_connector.Cursor:
|
|
245
246
|
cursor: redshift_connector.Cursor = conn.cursor()
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
247
|
+
with PerfTimer() as timer:
|
|
248
|
+
query_hash_id = hash(query)
|
|
249
|
+
logger.info(f"Executing query [{query_hash_id}]\n{query}")
|
|
250
|
+
cursor.execute(query)
|
|
251
|
+
logger.info(
|
|
252
|
+
f"Time taken query [{query_hash_id}: {timer.elapsed_seconds():.3f} seconds"
|
|
253
|
+
)
|
|
249
254
|
return cursor
|
|
250
255
|
|
|
251
256
|
@staticmethod
|
|
@@ -545,8 +550,7 @@ class RedshiftDataDictionary:
|
|
|
545
550
|
conn: redshift_connector.Connection,
|
|
546
551
|
query: str,
|
|
547
552
|
) -> Iterable[LineageRow]:
|
|
548
|
-
cursor =
|
|
549
|
-
cursor.execute(query)
|
|
553
|
+
cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
|
|
550
554
|
field_names = [i[0] for i in cursor.description]
|
|
551
555
|
|
|
552
556
|
rows = cursor.fetchmany()
|
|
@@ -603,9 +607,7 @@ class RedshiftDataDictionary:
|
|
|
603
607
|
conn: redshift_connector.Connection,
|
|
604
608
|
query: str,
|
|
605
609
|
) -> Iterable[TempTableRow]:
|
|
606
|
-
cursor =
|
|
607
|
-
|
|
608
|
-
cursor.execute(query)
|
|
610
|
+
cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
|
|
609
611
|
|
|
610
612
|
field_names = [i[0] for i in cursor.description]
|
|
611
613
|
|
|
@@ -662,8 +664,9 @@ class RedshiftDataDictionary:
|
|
|
662
664
|
def get_outbound_datashares(
|
|
663
665
|
conn: redshift_connector.Connection,
|
|
664
666
|
) -> Iterable[OutboundDatashare]:
|
|
665
|
-
cursor =
|
|
666
|
-
|
|
667
|
+
cursor = RedshiftDataDictionary.get_query_result(
|
|
668
|
+
conn=conn, query=RedshiftCommonQuery.list_outbound_datashares()
|
|
669
|
+
)
|
|
667
670
|
for item in cursor.fetchall():
|
|
668
671
|
yield OutboundDatashare(
|
|
669
672
|
share_name=item[1],
|
|
@@ -678,8 +681,10 @@ class RedshiftDataDictionary:
|
|
|
678
681
|
conn: redshift_connector.Connection,
|
|
679
682
|
database: str,
|
|
680
683
|
) -> Optional[InboundDatashare]:
|
|
681
|
-
cursor =
|
|
682
|
-
|
|
684
|
+
cursor = RedshiftDataDictionary.get_query_result(
|
|
685
|
+
conn=conn,
|
|
686
|
+
query=RedshiftCommonQuery.get_inbound_datashare(database),
|
|
687
|
+
)
|
|
683
688
|
item = cursor.fetchone()
|
|
684
689
|
if item:
|
|
685
690
|
return InboundDatashare(
|
|
@@ -4,7 +4,6 @@ from typing import Dict, Optional
|
|
|
4
4
|
|
|
5
5
|
from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
|
|
6
6
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
7
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
8
7
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
9
8
|
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
10
9
|
from datahub.utilities.lossy_collections import LossyDict
|
|
@@ -15,7 +14,6 @@ from datahub.utilities.stats_collections import TopKDict
|
|
|
15
14
|
@dataclass
|
|
16
15
|
class RedshiftReport(
|
|
17
16
|
SQLSourceReport,
|
|
18
|
-
IngestionStageReport,
|
|
19
17
|
BaseTimeWindowReport,
|
|
20
18
|
ClassificationReportMixin,
|
|
21
19
|
):
|
|
@@ -25,6 +25,7 @@ from datahub.ingestion.source.redshift.query import (
|
|
|
25
25
|
RedshiftServerlessQuery,
|
|
26
26
|
)
|
|
27
27
|
from datahub.ingestion.source.redshift.redshift_schema import (
|
|
28
|
+
RedshiftDataDictionary,
|
|
28
29
|
RedshiftTable,
|
|
29
30
|
RedshiftView,
|
|
30
31
|
)
|
|
@@ -182,15 +183,17 @@ class RedshiftUsageExtractor:
|
|
|
182
183
|
self.report.num_operational_stats_filtered = 0
|
|
183
184
|
|
|
184
185
|
if self.config.include_operational_stats:
|
|
185
|
-
with
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
self.
|
|
192
|
-
|
|
193
|
-
|
|
186
|
+
with (
|
|
187
|
+
self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
|
|
188
|
+
PerfTimer() as timer,
|
|
189
|
+
):
|
|
190
|
+
# Generate operation aspect workunits
|
|
191
|
+
yield from self._gen_operation_aspect_workunits(
|
|
192
|
+
self.connection, all_tables
|
|
193
|
+
)
|
|
194
|
+
self.report.operational_metadata_extraction_sec[
|
|
195
|
+
self.config.database
|
|
196
|
+
] = timer.elapsed_seconds(digits=2)
|
|
194
197
|
|
|
195
198
|
# Generate aggregate events
|
|
196
199
|
with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
|
|
@@ -261,8 +264,7 @@ class RedshiftUsageExtractor:
|
|
|
261
264
|
connection: redshift_connector.Connection,
|
|
262
265
|
all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
|
|
263
266
|
) -> Iterable[RedshiftAccessEvent]:
|
|
264
|
-
cursor =
|
|
265
|
-
cursor.execute(query)
|
|
267
|
+
cursor = RedshiftDataDictionary.get_query_result(conn=connection, query=query)
|
|
266
268
|
results = cursor.fetchmany()
|
|
267
269
|
field_names = [i[0] for i in cursor.description]
|
|
268
270
|
while results:
|
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
from dataclasses import field as dataclass_field
|
|
3
|
-
from typing import List
|
|
4
3
|
|
|
5
4
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
6
5
|
StaleEntityRemovalSourceReport,
|
|
7
6
|
)
|
|
7
|
+
from datahub.utilities.lossy_collections import LossyList
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
@dataclasses.dataclass
|
|
11
11
|
class DataLakeSourceReport(StaleEntityRemovalSourceReport):
|
|
12
12
|
files_scanned = 0
|
|
13
|
-
filtered:
|
|
13
|
+
filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
|
|
14
|
+
number_of_files_filtered: int = 0
|
|
14
15
|
|
|
15
16
|
def report_file_scanned(self) -> None:
|
|
16
17
|
self.files_scanned += 1
|
|
17
18
|
|
|
18
19
|
def report_file_dropped(self, file: str) -> None:
|
|
19
20
|
self.filtered.append(file)
|
|
21
|
+
self.number_of_files_filtered += 1
|