acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
# This import verifies that the dependencies are available.
|
|
2
|
+
import logging
|
|
3
|
+
from typing import TYPE_CHECKING, Any, List, Optional
|
|
2
4
|
|
|
3
5
|
import pymysql # noqa: F401
|
|
4
6
|
from pydantic.fields import Field
|
|
5
|
-
from sqlalchemy import util
|
|
7
|
+
from sqlalchemy import create_engine, event, inspect, util
|
|
6
8
|
from sqlalchemy.dialects.mysql import BIT, base
|
|
7
9
|
from sqlalchemy.dialects.mysql.enumerated import SET
|
|
8
10
|
from sqlalchemy.engine.reflection import Inspector
|
|
9
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from sqlalchemy.engine import Engine
|
|
14
|
+
|
|
15
|
+
from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
|
|
10
16
|
from datahub.ingestion.api.decorators import (
|
|
11
17
|
SourceCapability,
|
|
12
18
|
SupportStatus,
|
|
@@ -15,16 +21,27 @@ from datahub.ingestion.api.decorators import (
|
|
|
15
21
|
platform_name,
|
|
16
22
|
support_status,
|
|
17
23
|
)
|
|
24
|
+
from datahub.ingestion.source.aws.aws_common import (
|
|
25
|
+
AwsConnectionConfig,
|
|
26
|
+
RDSIAMTokenManager,
|
|
27
|
+
)
|
|
18
28
|
from datahub.ingestion.source.sql.sql_common import (
|
|
19
29
|
make_sqlalchemy_type,
|
|
20
30
|
register_custom_type,
|
|
21
31
|
)
|
|
22
32
|
from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
|
|
33
|
+
from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
|
|
34
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
35
|
+
BaseProcedure,
|
|
36
|
+
)
|
|
23
37
|
from datahub.ingestion.source.sql.two_tier_sql_source import (
|
|
24
38
|
TwoTierSQLAlchemyConfig,
|
|
25
39
|
TwoTierSQLAlchemySource,
|
|
26
40
|
)
|
|
27
41
|
from datahub.metadata.schema_classes import BytesTypeClass
|
|
42
|
+
from datahub.utilities.str_enum import StrEnum
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
28
45
|
|
|
29
46
|
SET.__repr__ = util.generic_repr # type:ignore
|
|
30
47
|
|
|
@@ -48,16 +65,49 @@ base.ischema_names["polygon"] = POLYGON
|
|
|
48
65
|
base.ischema_names["decimal128"] = DECIMAL128
|
|
49
66
|
|
|
50
67
|
|
|
68
|
+
class MySQLAuthMode(StrEnum):
|
|
69
|
+
"""Authentication mode for MySQL connection."""
|
|
70
|
+
|
|
71
|
+
PASSWORD = "PASSWORD"
|
|
72
|
+
AWS_IAM = "AWS_IAM"
|
|
73
|
+
|
|
74
|
+
|
|
51
75
|
class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
|
|
52
76
|
# defaults
|
|
53
77
|
host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
|
|
54
|
-
scheme: str = "mysql+pymysql"
|
|
78
|
+
scheme: HiddenFromDocs[str] = "mysql+pymysql"
|
|
79
|
+
|
|
80
|
+
# Authentication configuration
|
|
81
|
+
auth_mode: MySQLAuthMode = Field(
|
|
82
|
+
default=MySQLAuthMode.PASSWORD,
|
|
83
|
+
description="Authentication mode to use for the MySQL connection. "
|
|
84
|
+
"Options are 'PASSWORD' (default) for standard username/password authentication, "
|
|
85
|
+
"or 'AWS_IAM' for AWS RDS IAM authentication.",
|
|
86
|
+
)
|
|
87
|
+
aws_config: AwsConnectionConfig = Field(
|
|
88
|
+
default_factory=AwsConnectionConfig,
|
|
89
|
+
description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
|
|
90
|
+
"Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
|
|
91
|
+
"If not explicitly configured, boto3 will automatically use the default credential chain and region from "
|
|
92
|
+
"environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
|
|
93
|
+
)
|
|
55
94
|
|
|
56
95
|
|
|
57
96
|
class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
|
|
58
97
|
def get_identifier(self, *, schema: str, table: str) -> str:
|
|
59
98
|
return f"{schema}.{table}"
|
|
60
99
|
|
|
100
|
+
include_stored_procedures: bool = Field(
|
|
101
|
+
default=True,
|
|
102
|
+
description="Include ingest of stored procedures.",
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
procedure_pattern: AllowDenyPattern = Field(
|
|
106
|
+
default=AllowDenyPattern.allow_all(),
|
|
107
|
+
description="Regex patterns for stored procedures to filter in ingestion."
|
|
108
|
+
"Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
|
|
109
|
+
)
|
|
110
|
+
|
|
61
111
|
|
|
62
112
|
@platform_name("MySQL")
|
|
63
113
|
@config_class(MySQLConfig)
|
|
@@ -65,7 +115,6 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
|
|
|
65
115
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
66
116
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
67
117
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
68
|
-
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
|
|
69
118
|
class MySQLSource(TwoTierSQLAlchemySource):
|
|
70
119
|
"""
|
|
71
120
|
This plugin extracts the following:
|
|
@@ -75,9 +124,27 @@ class MySQLSource(TwoTierSQLAlchemySource):
|
|
|
75
124
|
Table, row, and column statistics via optional SQL profiling
|
|
76
125
|
"""
|
|
77
126
|
|
|
78
|
-
|
|
127
|
+
config: MySQLConfig
|
|
128
|
+
|
|
129
|
+
def __init__(self, config: MySQLConfig, ctx: Any):
|
|
79
130
|
super().__init__(config, ctx, self.get_platform())
|
|
80
131
|
|
|
132
|
+
self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
|
|
133
|
+
if config.auth_mode == MySQLAuthMode.AWS_IAM:
|
|
134
|
+
hostname, port = parse_host_port(config.host_port, default_port=3306)
|
|
135
|
+
if port is None:
|
|
136
|
+
raise ValueError("Port must be specified for RDS IAM authentication")
|
|
137
|
+
|
|
138
|
+
if not config.username:
|
|
139
|
+
raise ValueError("username is required for RDS IAM authentication")
|
|
140
|
+
|
|
141
|
+
self._rds_iam_token_manager = RDSIAMTokenManager(
|
|
142
|
+
endpoint=hostname,
|
|
143
|
+
username=config.username,
|
|
144
|
+
port=port,
|
|
145
|
+
aws_config=config.aws_config,
|
|
146
|
+
)
|
|
147
|
+
|
|
81
148
|
def get_platform(self):
|
|
82
149
|
return "mysql"
|
|
83
150
|
|
|
@@ -86,6 +153,52 @@ class MySQLSource(TwoTierSQLAlchemySource):
|
|
|
86
153
|
config = MySQLConfig.parse_obj(config_dict)
|
|
87
154
|
return cls(config, ctx)
|
|
88
155
|
|
|
156
|
+
def _setup_rds_iam_event_listener(
|
|
157
|
+
self, engine: "Engine", database_name: Optional[str] = None
|
|
158
|
+
) -> None:
|
|
159
|
+
"""Setup SQLAlchemy event listener to inject RDS IAM tokens."""
|
|
160
|
+
if not (
|
|
161
|
+
self.config.auth_mode == MySQLAuthMode.AWS_IAM
|
|
162
|
+
and self._rds_iam_token_manager
|
|
163
|
+
):
|
|
164
|
+
return
|
|
165
|
+
|
|
166
|
+
def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
|
|
167
|
+
if not self._rds_iam_token_manager:
|
|
168
|
+
raise RuntimeError("RDS IAM Token Manager is not initialized")
|
|
169
|
+
cparams["password"] = self._rds_iam_token_manager.get_token()
|
|
170
|
+
# PyMySQL requires SSL to be enabled for RDS IAM authentication.
|
|
171
|
+
# Preserve any existing SSL configuration, otherwise enable with default settings.
|
|
172
|
+
# The {"ssl": True} dict is a workaround to make PyMySQL recognize that SSL
|
|
173
|
+
# should be enabled, since the library requires a truthy value in the ssl parameter.
|
|
174
|
+
# See https://pymysql.readthedocs.io/en/latest/modules/connections.html#pymysql.connections.Connection
|
|
175
|
+
cparams["ssl"] = cparams.get("ssl") or {"ssl": True}
|
|
176
|
+
|
|
177
|
+
event.listen(engine, "do_connect", do_connect_listener) # type: ignore[misc]
|
|
178
|
+
|
|
179
|
+
def get_inspectors(self):
|
|
180
|
+
url = self.config.get_sql_alchemy_url()
|
|
181
|
+
logger.debug(f"sql_alchemy_url={url}")
|
|
182
|
+
|
|
183
|
+
engine = create_engine(url, **self.config.options)
|
|
184
|
+
self._setup_rds_iam_event_listener(engine)
|
|
185
|
+
|
|
186
|
+
with engine.connect() as conn:
|
|
187
|
+
inspector = inspect(conn)
|
|
188
|
+
if self.config.database and self.config.database != "":
|
|
189
|
+
databases = [self.config.database]
|
|
190
|
+
else:
|
|
191
|
+
databases = inspector.get_schema_names()
|
|
192
|
+
for db in databases:
|
|
193
|
+
if self.config.database_pattern.allowed(db):
|
|
194
|
+
url = self.config.get_sql_alchemy_url(current_db=db)
|
|
195
|
+
db_engine = create_engine(url, **self.config.options)
|
|
196
|
+
self._setup_rds_iam_event_listener(db_engine, database_name=db)
|
|
197
|
+
|
|
198
|
+
with db_engine.connect() as conn:
|
|
199
|
+
inspector = inspect(conn)
|
|
200
|
+
yield inspector
|
|
201
|
+
|
|
89
202
|
def add_profile_metadata(self, inspector: Inspector) -> None:
|
|
90
203
|
if not self.config.is_profiling_enabled():
|
|
91
204
|
return
|
|
@@ -96,3 +209,40 @@ class MySQLSource(TwoTierSQLAlchemySource):
|
|
|
96
209
|
self.profile_metadata_info.dataset_name_to_storage_bytes[
|
|
97
210
|
f"{row.TABLE_SCHEMA}.{row.TABLE_NAME}"
|
|
98
211
|
] = row.DATA_LENGTH
|
|
212
|
+
|
|
213
|
+
def get_procedures_for_schema(
    self, inspector: Inspector, schema: str, db_name: str
) -> List[BaseProcedure]:
    """
    List the stored procedures defined in ``schema``.

    Reads ``information_schema.ROUTINES`` through a fresh connection on the
    inspector's engine, keeping only rows whose ``ROUTINE_TYPE`` is
    ``'PROCEDURE'``. Only name, language and definition are populated on the
    returned ``BaseProcedure`` objects; every other attribute is ``None``.
    ``db_name`` is accepted but not used by this implementation.
    """
    routines_query = """
        SELECT ROUTINE_NAME AS name,
            ROUTINE_DEFINITION AS definition,
            EXTERNAL_LANGUAGE AS language
        FROM information_schema.ROUTINES
        WHERE ROUTINE_TYPE = 'PROCEDURE'
        AND ROUTINE_SCHEMA = %s
        """

    # Materialise the rows before the connection is released.
    with inspector.engine.connect() as connection:
        routine_rows = list(connection.execute(routines_query, (schema,)))

    return [
        BaseProcedure(
            name=routine.name,
            language=routine.language,
            argument_signature=None,
            return_type=None,
            procedure_definition=routine.definition,
            created=None,
            last_altered=None,
            extra_properties=None,
            comment=None,
        )
        for routine in routine_rows
    ]
|
|
@@ -37,7 +37,7 @@ from datahub.ingestion.source.sql.sql_config import (
|
|
|
37
37
|
|
|
38
38
|
logger = logging.getLogger(__name__)
|
|
39
39
|
|
|
40
|
-
oracledb.version = "8.3.0"
|
|
40
|
+
oracledb.version = "8.3.0" # type: ignore[assignment]
|
|
41
41
|
sys.modules["cx_Oracle"] = oracledb
|
|
42
42
|
|
|
43
43
|
extra_oracle_types = {
|
|
@@ -110,10 +110,10 @@ class OracleConfig(BasicSQLAlchemyConfig):
|
|
|
110
110
|
return v
|
|
111
111
|
|
|
112
112
|
@pydantic.validator("data_dictionary_mode")
|
|
113
|
-
def check_data_dictionary_mode(cls,
|
|
114
|
-
if
|
|
113
|
+
def check_data_dictionary_mode(cls, value):
    """Validate ``data_dictionary_mode``: accept only ``"ALL"`` or ``"DBA"``.

    Returns the value unchanged when it is one of the two accepted modes and
    raises ``ValueError`` otherwise. The comparison is case-sensitive.
    """
    if value in ("ALL", "DBA"):
        return value
    raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
|
|
117
117
|
|
|
118
118
|
@pydantic.validator("thick_mode_lib_dir", always=True)
|
|
119
119
|
def check_thick_mode_lib_dir(cls, v, values):
|
|
@@ -441,7 +441,7 @@ class OracleInspectorObjectWrapper:
|
|
|
441
441
|
"\nac.constraint_name,"
|
|
442
442
|
"\nac.constraint_type,"
|
|
443
443
|
"\nacc.column_name AS local_column,"
|
|
444
|
-
"\nac.
|
|
444
|
+
"\nac.table_name AS remote_table,"
|
|
445
445
|
"\nrcc.column_name AS remote_column,"
|
|
446
446
|
"\nac.r_owner AS remote_owner,"
|
|
447
447
|
"\nacc.position AS loc_pos,"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections import defaultdict
|
|
3
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
4
4
|
|
|
5
5
|
# This import verifies that the dependencies are available.
|
|
6
6
|
import psycopg2 # noqa: F401
|
|
@@ -14,9 +14,12 @@ import sqlalchemy.dialects.postgresql as custom_types
|
|
|
14
14
|
from geoalchemy2 import Geometry # noqa: F401
|
|
15
15
|
from pydantic import BaseModel
|
|
16
16
|
from pydantic.fields import Field
|
|
17
|
-
from sqlalchemy import create_engine, inspect
|
|
17
|
+
from sqlalchemy import create_engine, event, inspect
|
|
18
18
|
from sqlalchemy.engine.reflection import Inspector
|
|
19
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from sqlalchemy.engine import Engine
|
|
22
|
+
|
|
20
23
|
from datahub.configuration.common import AllowDenyPattern
|
|
21
24
|
from datahub.emitter import mce_builder
|
|
22
25
|
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
@@ -30,17 +33,26 @@ from datahub.ingestion.api.decorators import (
|
|
|
30
33
|
support_status,
|
|
31
34
|
)
|
|
32
35
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
36
|
+
from datahub.ingestion.source.aws.aws_common import (
|
|
37
|
+
AwsConnectionConfig,
|
|
38
|
+
RDSIAMTokenManager,
|
|
39
|
+
)
|
|
33
40
|
from datahub.ingestion.source.sql.sql_common import (
|
|
34
41
|
SQLAlchemySource,
|
|
35
42
|
SqlWorkUnit,
|
|
36
43
|
register_custom_type,
|
|
37
44
|
)
|
|
38
45
|
from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
|
|
46
|
+
from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
|
|
47
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
48
|
+
BaseProcedure,
|
|
49
|
+
)
|
|
39
50
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
40
51
|
ArrayTypeClass,
|
|
41
52
|
BytesTypeClass,
|
|
42
53
|
MapTypeClass,
|
|
43
54
|
)
|
|
55
|
+
from datahub.utilities.str_enum import StrEnum
|
|
44
56
|
|
|
45
57
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
46
58
|
|
|
@@ -97,12 +109,34 @@ class ViewLineageEntry(BaseModel):
|
|
|
97
109
|
dependent_schema: str
|
|
98
110
|
|
|
99
111
|
|
|
112
|
+
class PostgresAuthMode(StrEnum):
|
|
113
|
+
"""Authentication mode for PostgreSQL connection."""
|
|
114
|
+
|
|
115
|
+
PASSWORD = "PASSWORD"
|
|
116
|
+
AWS_IAM = "AWS_IAM"
|
|
117
|
+
|
|
118
|
+
|
|
100
119
|
class BasePostgresConfig(BasicSQLAlchemyConfig):
    # Documented with comments rather than a class docstring so that the
    # model's generated schema description is left untouched.

    # SQLAlchemy dialect+driver used to build the connection URL.
    scheme: str = Field(description="database scheme", default="postgresql+psycopg2")

    # "information_schema" is denied by default.
    schema_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern(deny=["information_schema"])
    )

    # Authentication configuration
    auth_mode: PostgresAuthMode = Field(
        description=(
            "Authentication mode to use for the PostgreSQL connection. "
            "Options are 'PASSWORD' (default) for standard username/password authentication, "
            "or 'AWS_IAM' for AWS RDS IAM authentication."
        ),
        default=PostgresAuthMode.PASSWORD,
    )

    # A default-constructed AwsConnectionConfig is used when the recipe gives none.
    aws_config: AwsConnectionConfig = Field(
        description=(
            "AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
            "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
            "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
            "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata."
        ),
        default_factory=AwsConnectionConfig,
    )
|
|
139
|
+
|
|
106
140
|
|
|
107
141
|
class PostgresConfig(BasePostgresConfig):
|
|
108
142
|
database_pattern: AllowDenyPattern = Field(
|
|
@@ -124,6 +158,17 @@ class PostgresConfig(BasePostgresConfig):
|
|
|
124
158
|
),
|
|
125
159
|
)
|
|
126
160
|
|
|
161
|
+
include_stored_procedures: bool = Field(
|
|
162
|
+
default=True,
|
|
163
|
+
description="Include ingest of stored procedures.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
procedure_pattern: AllowDenyPattern = Field(
    default=AllowDenyPattern.allow_all(),
    # The description is built by implicit concatenation of adjacent string
    # literals, so the first literal must end with a space; without it the
    # rendered text reads "...in ingestion.Specify regex...".
    description="Regex patterns for stored procedures to filter in ingestion. "
    "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
)
|
|
171
|
+
|
|
127
172
|
|
|
128
173
|
@platform_name("Postgres")
|
|
129
174
|
@config_class(PostgresConfig)
|
|
@@ -131,12 +176,11 @@ class PostgresConfig(BasePostgresConfig):
|
|
|
131
176
|
@capability(SourceCapability.DOMAINS, "Enabled by default")
|
|
132
177
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
133
178
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
134
|
-
@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
|
|
135
179
|
class PostgresSource(SQLAlchemySource):
|
|
136
180
|
"""
|
|
137
181
|
This plugin extracts the following:
|
|
138
182
|
|
|
139
|
-
- Metadata for databases, schemas, views, and
|
|
183
|
+
- Metadata for databases, schemas, views, tables, and stored procedures
|
|
140
184
|
- Column types associated with each table
|
|
141
185
|
- Also supports PostGIS extensions
|
|
142
186
|
- Table, row, and column statistics via optional SQL profiling
|
|
@@ -147,6 +191,22 @@ class PostgresSource(SQLAlchemySource):
|
|
|
147
191
|
def __init__(self, config: PostgresConfig, ctx: PipelineContext):
    """Initialise the source and, for AWS_IAM auth, create the RDS IAM token manager.

    ``_rds_iam_token_manager`` stays ``None`` unless ``config.auth_mode`` is
    ``PostgresAuthMode.AWS_IAM``.

    Raises:
        ValueError: in AWS_IAM mode, when no port can be determined from
            ``config.host_port`` or when ``config.username`` is empty.
    """
    super().__init__(config, ctx, self.get_platform())

    self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
    if config.auth_mode != PostgresAuthMode.AWS_IAM:
        # Password authentication needs no extra setup.
        return

    # 5432 is used when host_port does not carry an explicit port.
    hostname, port = parse_host_port(config.host_port, default_port=5432)
    if port is None:
        raise ValueError("Port must be specified for RDS IAM authentication")

    if not config.username:
        raise ValueError("username is required for RDS IAM authentication")

    token_manager = RDSIAMTokenManager(
        endpoint=hostname,
        username=config.username,
        port=port,
        aws_config=config.aws_config,
    )
    self._rds_iam_token_manager = token_manager
|
|
209
|
+
|
|
150
210
|
def get_platform(self):
    """Return the platform name string for this source (``"postgres"``)."""
    platform_name_value = "postgres"
    return platform_name_value
|
|
152
212
|
|
|
@@ -155,13 +215,36 @@ class PostgresSource(SQLAlchemySource):
|
|
|
155
215
|
config = PostgresConfig.parse_obj(config_dict)
|
|
156
216
|
return cls(config, ctx)
|
|
157
217
|
|
|
218
|
+
def _setup_rds_iam_event_listener(
    self, engine: "Engine", database_name: Optional[str] = None
) -> None:
    """Register a ``do_connect`` hook on ``engine`` that injects an RDS IAM token.

    Does nothing unless ``auth_mode`` is ``AWS_IAM`` and a token manager has
    been created. ``database_name`` is accepted but not used by this
    implementation.
    """
    iam_requested = self.config.auth_mode == PostgresAuthMode.AWS_IAM
    if not iam_requested or not self._rds_iam_token_manager:
        return

    def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
        # Fetch the token on every invocation instead of capturing one up front.
        token_manager = self._rds_iam_token_manager
        if not token_manager:
            raise RuntimeError("RDS IAM Token Manager is not initialized")
        cparams["password"] = token_manager.get_token()

        # Keep a caller-supplied sslmode only when it is one of these three
        # values; anything else (including a missing value) becomes "require".
        accepted_ssl_modes = ("require", "verify-ca", "verify-full")
        if cparams.get("sslmode") not in accepted_ssl_modes:
            cparams["sslmode"] = "require"

    event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
|
|
236
|
+
|
|
158
237
|
def get_inspectors(self) -> Iterable[Inspector]:
|
|
159
238
|
# Note: get_sql_alchemy_url will choose `sqlalchemy_uri` over the passed in database
|
|
160
239
|
url = self.config.get_sql_alchemy_url(
|
|
161
240
|
database=self.config.database or self.config.initial_database
|
|
162
241
|
)
|
|
242
|
+
|
|
163
243
|
logger.debug(f"sql_alchemy_url={url}")
|
|
244
|
+
|
|
164
245
|
engine = create_engine(url, **self.config.options)
|
|
246
|
+
self._setup_rds_iam_event_listener(engine)
|
|
247
|
+
|
|
165
248
|
with engine.connect() as conn:
|
|
166
249
|
if self.config.database or self.config.sqlalchemy_uri:
|
|
167
250
|
inspector = inspect(conn)
|
|
@@ -169,14 +252,21 @@ class PostgresSource(SQLAlchemySource):
|
|
|
169
252
|
else:
|
|
170
253
|
# pg_database catalog - https://www.postgresql.org/docs/current/catalog-pg-database.html
|
|
171
254
|
# exclude template databases - https://www.postgresql.org/docs/current/manage-ag-templatedbs.html
|
|
255
|
+
# exclude rdsadmin - AWS RDS administrative database
|
|
172
256
|
databases = conn.execute(
|
|
173
|
-
"SELECT datname from pg_database where datname not in ('template0', 'template1')"
|
|
257
|
+
"SELECT datname from pg_database where datname not in ('template0', 'template1', 'rdsadmin')"
|
|
174
258
|
)
|
|
175
259
|
for db in databases:
|
|
176
260
|
if not self.config.database_pattern.allowed(db["datname"]):
|
|
177
261
|
continue
|
|
262
|
+
|
|
178
263
|
url = self.config.get_sql_alchemy_url(database=db["datname"])
|
|
179
|
-
|
|
264
|
+
db_engine = create_engine(url, **self.config.options)
|
|
265
|
+
self._setup_rds_iam_event_listener(
|
|
266
|
+
db_engine, database_name=db["datname"]
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
with db_engine.connect() as conn:
|
|
180
270
|
inspector = inspect(conn)
|
|
181
271
|
yield inspector
|
|
182
272
|
|
|
@@ -292,3 +382,49 @@ class PostgresSource(SQLAlchemySource):
|
|
|
292
382
|
] = row.table_size
|
|
293
383
|
except Exception as e:
|
|
294
384
|
logger.error(f"failed to fetch profile metadata: {e}")
|
|
385
|
+
|
|
386
|
+
def get_procedures_for_schema(
    self, inspector: Inspector, schema: str, db_name: str
) -> List[BaseProcedure]:
    """
    List the stored procedures defined in ``schema``.

    Queries ``pg_proc`` joined with ``pg_namespace`` and ``pg_language``
    through a fresh connection on the inspector's engine, keeping only rows
    with ``prokind = 'p'`` in the requested namespace. Name, language,
    argument signature, definition and comment are populated on each
    ``BaseProcedure``; the remaining attributes are ``None``.
    ``db_name`` is accepted but not used by this implementation.
    """
    catalog_query = """
        SELECT
            p.proname AS name,
            l.lanname AS language,
            pg_get_function_arguments(p.oid) AS arguments,
            pg_get_functiondef(p.oid) AS definition,
            obj_description(p.oid, 'pg_proc') AS comment
        FROM
            pg_proc p
        JOIN
            pg_namespace n ON n.oid = p.pronamespace
        JOIN
            pg_language l ON l.oid = p.prolang
        WHERE
            p.prokind = 'p'
            AND n.nspname = %s;
        """

    # Pull every row while the connection is still open.
    with inspector.engine.connect() as connection:
        catalog_rows = list(connection.execute(catalog_query, (schema,)))

    return [
        BaseProcedure(
            name=proc.name,
            language=proc.language,
            argument_signature=proc.arguments,
            return_type=None,
            procedure_definition=proc.definition,
            created=None,
            last_altered=None,
            comment=proc.comment,
            extra_properties=None,
        )
        for proc in catalog_rows
    ]
|
|
@@ -8,6 +8,7 @@ from sqlalchemy import exc, sql
|
|
|
8
8
|
from sqlalchemy.engine import reflection
|
|
9
9
|
from sqlalchemy.engine.base import Engine
|
|
10
10
|
|
|
11
|
+
from datahub.configuration.common import HiddenFromDocs
|
|
11
12
|
from datahub.ingestion.api.common import PipelineContext
|
|
12
13
|
from datahub.ingestion.api.decorators import (
|
|
13
14
|
SourceCapability,
|
|
@@ -87,7 +88,7 @@ PrestoDialect._get_full_table = _get_full_table
|
|
|
87
88
|
|
|
88
89
|
class PrestoConfig(TrinoConfig):
    # Overrides the scheme default with "presto".
    # NOTE(review): HiddenFromDocs presumably keeps this field out of the
    # generated configuration docs — confirm against its definition.
    scheme: HiddenFromDocs[str] = Field(default="presto")
|
|
91
92
|
|
|
92
93
|
|
|
93
94
|
@platform_name("Presto", doc_order=1)
|