acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
3
|
import urllib.parse
|
|
4
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
import pydantic
|
|
7
7
|
import sqlalchemy.dialects.mssql
|
|
@@ -10,9 +10,11 @@ from sqlalchemy import create_engine, inspect
|
|
|
10
10
|
from sqlalchemy.engine.base import Connection
|
|
11
11
|
from sqlalchemy.engine.reflection import Inspector
|
|
12
12
|
from sqlalchemy.exc import ProgrammingError, ResourceClosedError
|
|
13
|
+
from sqlalchemy.sql import quoted_name
|
|
13
14
|
|
|
14
15
|
import datahub.metadata.schema_classes as models
|
|
15
|
-
from datahub.configuration.common import AllowDenyPattern
|
|
16
|
+
from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
|
|
17
|
+
from datahub.configuration.pattern_utils import UUID_REGEX
|
|
16
18
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
17
19
|
from datahub.ingestion.api.common import PipelineContext
|
|
18
20
|
from datahub.ingestion.api.decorators import (
|
|
@@ -26,6 +28,7 @@ from datahub.ingestion.api.decorators import (
|
|
|
26
28
|
from datahub.ingestion.api.source import StructuredLogLevel
|
|
27
29
|
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
28
30
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
31
|
+
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
29
32
|
from datahub.ingestion.source.sql.mssql.job_models import (
|
|
30
33
|
JobStep,
|
|
31
34
|
MSSQLDataFlow,
|
|
@@ -37,19 +40,18 @@ from datahub.ingestion.source.sql.mssql.job_models import (
|
|
|
37
40
|
ProcedureParameter,
|
|
38
41
|
StoredProcedure,
|
|
39
42
|
)
|
|
40
|
-
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
|
|
41
|
-
generate_procedure_lineage,
|
|
42
|
-
)
|
|
43
43
|
from datahub.ingestion.source.sql.sql_common import (
|
|
44
44
|
SQLAlchemySource,
|
|
45
|
-
SqlWorkUnit,
|
|
46
45
|
register_custom_type,
|
|
47
46
|
)
|
|
48
47
|
from datahub.ingestion.source.sql.sql_config import (
|
|
49
48
|
BasicSQLAlchemyConfig,
|
|
50
|
-
make_sqlalchemy_uri,
|
|
51
49
|
)
|
|
52
50
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
51
|
+
from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
|
|
52
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
53
|
+
generate_procedure_lineage,
|
|
54
|
+
)
|
|
53
55
|
from datahub.utilities.file_backed_collections import FileBackedList
|
|
54
56
|
|
|
55
57
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -60,11 +62,22 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
|
|
|
60
62
|
register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
|
|
61
63
|
register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)
|
|
62
64
|
|
|
65
|
+
# Patterns copied from Snowflake source
|
|
66
|
+
DEFAULT_TEMP_TABLES_PATTERNS = [
|
|
67
|
+
r".*\.FIVETRAN_.*_STAGING\..*", # fivetran
|
|
68
|
+
r".*__DBT_TMP$", # dbt
|
|
69
|
+
rf".*\.SEGMENT_{UUID_REGEX}", # segment
|
|
70
|
+
rf".*\.STAGING_.*_{UUID_REGEX}", # stitch
|
|
71
|
+
r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations
|
|
72
|
+
]
|
|
73
|
+
|
|
63
74
|
|
|
64
75
|
class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
65
76
|
# defaults
|
|
66
77
|
host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
|
|
67
|
-
scheme: str = Field(default="mssql+pytds"
|
|
78
|
+
scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
|
|
79
|
+
|
|
80
|
+
# TODO: rename to include_procedures ?
|
|
68
81
|
include_stored_procedures: bool = Field(
|
|
69
82
|
default=True,
|
|
70
83
|
description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
|
|
@@ -112,10 +125,24 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
|
112
125
|
default=False,
|
|
113
126
|
description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
|
|
114
127
|
)
|
|
128
|
+
temporary_tables_pattern: List[str] = Field(
|
|
129
|
+
default=DEFAULT_TEMP_TABLES_PATTERNS,
|
|
130
|
+
description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
|
|
131
|
+
"match the entire table name in database.schema.table format. Defaults are to set in such a way "
|
|
132
|
+
"to ignore the temporary staging tables created by known ETL tools.",
|
|
133
|
+
)
|
|
134
|
+
quote_schemas: bool = Field(
|
|
135
|
+
default=False,
|
|
136
|
+
description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
|
|
137
|
+
)
|
|
138
|
+
is_aws_rds: Optional[bool] = Field(
|
|
139
|
+
default=None,
|
|
140
|
+
description="Indicates if the SQL Server instance is running on AWS RDS. When None (default), automatic detection will be attempted using server name analysis.",
|
|
141
|
+
)
|
|
115
142
|
|
|
116
143
|
@pydantic.validator("uri_args")
|
|
117
144
|
def passwords_match(cls, v, values, **kwargs):
|
|
118
|
-
if values["use_odbc"] and "driver" not in v:
|
|
145
|
+
if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
|
|
119
146
|
raise ValueError("uri_args must contain a 'driver' option")
|
|
120
147
|
elif not values["use_odbc"] and v:
|
|
121
148
|
raise ValueError("uri_args is not supported when ODBC is disabled")
|
|
@@ -126,22 +153,36 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
|
126
153
|
uri_opts: Optional[Dict[str, Any]] = None,
|
|
127
154
|
current_db: Optional[str] = None,
|
|
128
155
|
) -> str:
|
|
156
|
+
current_db = current_db or self.database
|
|
157
|
+
|
|
129
158
|
if self.use_odbc:
|
|
130
159
|
# Ensure that the import is available.
|
|
131
160
|
import pyodbc # noqa: F401
|
|
132
161
|
|
|
133
162
|
self.scheme = "mssql+pyodbc"
|
|
134
163
|
|
|
164
|
+
# ODBC requires a database name, otherwise it will interpret host_port
|
|
165
|
+
# as a pre-defined ODBC connection name.
|
|
166
|
+
current_db = current_db or "master"
|
|
167
|
+
|
|
135
168
|
uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
|
|
136
169
|
self.scheme, # type: ignore
|
|
137
170
|
self.username,
|
|
138
171
|
self.password.get_secret_value() if self.password else None,
|
|
139
172
|
self.host_port, # type: ignore
|
|
140
|
-
current_db
|
|
173
|
+
current_db,
|
|
141
174
|
uri_opts=uri_opts,
|
|
142
175
|
)
|
|
143
176
|
if self.use_odbc:
|
|
144
|
-
|
|
177
|
+
final_uri_args = self.uri_args.copy()
|
|
178
|
+
if final_uri_args and current_db:
|
|
179
|
+
final_uri_args.update({"database": current_db})
|
|
180
|
+
|
|
181
|
+
uri = (
|
|
182
|
+
f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
|
|
183
|
+
if final_uri_args
|
|
184
|
+
else uri
|
|
185
|
+
)
|
|
145
186
|
return uri
|
|
146
187
|
|
|
147
188
|
@property
|
|
@@ -156,7 +197,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
|
156
197
|
@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
|
|
157
198
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
158
199
|
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
|
|
159
|
-
@capability(
|
|
200
|
+
@capability(
|
|
201
|
+
SourceCapability.LINEAGE_COARSE,
|
|
202
|
+
"Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
|
|
203
|
+
subtype_modifier=[
|
|
204
|
+
SourceCapabilityModifier.STORED_PROCEDURE,
|
|
205
|
+
SourceCapabilityModifier.VIEW,
|
|
206
|
+
],
|
|
207
|
+
)
|
|
208
|
+
@capability(
|
|
209
|
+
SourceCapability.LINEAGE_FINE,
|
|
210
|
+
"Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
|
|
211
|
+
subtype_modifier=[
|
|
212
|
+
SourceCapabilityModifier.STORED_PROCEDURE,
|
|
213
|
+
SourceCapabilityModifier.VIEW,
|
|
214
|
+
],
|
|
215
|
+
)
|
|
160
216
|
class SQLServerSource(SQLAlchemySource):
|
|
161
217
|
"""
|
|
162
218
|
This plugin extracts the following:
|
|
@@ -177,6 +233,14 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
177
233
|
self.table_descriptions: Dict[str, str] = {}
|
|
178
234
|
self.column_descriptions: Dict[str, str] = {}
|
|
179
235
|
self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
|
|
236
|
+
|
|
237
|
+
self.report = SQLSourceReport()
|
|
238
|
+
if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
|
|
239
|
+
self.report.warning(
|
|
240
|
+
title="Potential issue with lineage",
|
|
241
|
+
message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure lineage correct, set 'convert_urns_to_lowercase' to True.",
|
|
242
|
+
)
|
|
243
|
+
|
|
180
244
|
if self.config.include_descriptions:
|
|
181
245
|
for inspector in self.get_inspectors():
|
|
182
246
|
db_name: str = self.get_db_name(inspector)
|
|
@@ -297,32 +361,186 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
297
361
|
try:
|
|
298
362
|
yield from self.loop_jobs(inspector, self.config)
|
|
299
363
|
except Exception as e:
|
|
300
|
-
self.report.
|
|
301
|
-
"jobs",
|
|
302
|
-
|
|
364
|
+
self.report.failure(
|
|
365
|
+
message="Failed to list jobs",
|
|
366
|
+
title="SQL Server Jobs Extraction",
|
|
367
|
+
context="Error occurred during database-level job extraction",
|
|
368
|
+
exc=e,
|
|
303
369
|
)
|
|
304
370
|
|
|
305
|
-
def
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
371
|
+
def _detect_rds_environment(self, conn: Connection) -> bool:
    """
    Decide whether the connected SQL Server is an AWS RDS/managed instance.

    An explicit `is_aws_rds` config value (anything other than None) is
    returned as-is. Otherwise the name reported by `@@servername` is
    lower-cased and scanned for AWS-related substrings. Any exception
    raised during detection is logged and treated as "not RDS".

    Returns True if RDS/managed, False if on-premises.
    """
    # Explicit configuration short-circuits automatic detection.
    if self.config.is_aws_rds is not None:
        logger.info(
            f"Using explicit is_aws_rds configuration: {self.config.is_aws_rds}"
        )
        return self.config.is_aws_rds

    try:
        row = conn.execute("SELECT @@servername AS server_name").fetchone()
        if not row:
            logger.warning(
                "Could not retrieve server name, assuming non-RDS environment"
            )
            return False

        server_name = row["server_name"].lower()

        # Substring markers looked for in the server name.
        is_rds = False
        for marker in ("amazon", "amzn", "amaz", "ec2", "rds.amazonaws.com"):
            if marker in server_name:
                is_rds = True
                break

        if not is_rds:
            logger.info(
                f"Non-RDS environment detected based on server name: {server_name}"
            )
        else:
            logger.info(f"AWS RDS detected based on server name: {server_name}")

        return is_rds

    except Exception as e:
        # Detection is best-effort; fall back to the on-premises assumption.
        logger.warning(
            f"Failed to detect RDS/managed vs on-prem env, assuming non-RDS environment ({e})"
        )
        return False
|
|
324
410
|
|
|
325
411
|
def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
    """
    Retrieve job information for `db_name`, ordering the two retrieval
    methods according to the detected environment.

    On-premises: direct query first, stored procedures as the fallback.
    RDS/managed: stored procedures first, direct query as the fallback.
    When both methods fail, a failure is recorded on the report and the
    current (normally empty) result is returned.
    """
    found: Dict[str, Dict[str, Any]] = {}

    if not self._detect_rds_environment(conn):
        # On-premises environment - direct query is tried first
        try:
            found = self._get_jobs_via_direct_query(conn, db_name)
            logger.info(
                "Successfully retrieved jobs using direct query (on-premises environment)"
            )
            return found
        except Exception as direct_error:
            logger.warning(
                f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
            )
            # Stored procedures are the fallback here
            try:
                found = self._get_jobs_via_stored_procedures(conn, db_name)
                logger.info(
                    "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
                )
                return found
            except Exception as sp_error:
                self.report.failure(
                    message="Failed to retrieve jobs in on-premises environment",
                    title="SQL Server Jobs Extraction",
                    context="Both direct query and stored procedures methods failed",
                    exc=sp_error,
                )
    else:
        # Managed environment - stored procedures are tried first
        try:
            found = self._get_jobs_via_stored_procedures(conn, db_name)
            logger.info(
                "Successfully retrieved jobs using stored procedures (managed environment)"
            )
            return found
        except Exception as sp_error:
            logger.warning(
                f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
            )
            # Direct query is the fallback (might work in some managed environments)
            try:
                found = self._get_jobs_via_direct_query(conn, db_name)
                logger.info(
                    "Successfully retrieved jobs using direct query fallback in managed environment"
                )
                return found
            except Exception as direct_error:
                self.report.failure(
                    message="Failed to retrieve jobs in managed environment",
                    title="SQL Server Jobs Extraction",
                    context="Both stored procedures and direct query methods failed",
                    exc=direct_error,
                )

    return found
|
|
474
|
+
|
|
475
|
+
def _get_jobs_via_stored_procedures(
    self, conn: Connection, db_name: str
) -> Dict[str, Dict[str, Any]]:
    """
    Collect job steps through `msdb.dbo.sp_help_job` / `sp_help_jobstep`.

    Every job returned by `sp_help_job` is looked up with `sp_help_jobstep`;
    a step is kept when its `database_name` matches `db_name`
    (case-insensitively) or when it has no database at all (missing, NULL
    or empty). Jobs without any kept step are omitted.

    Returns a mapping of job name -> {step_id -> step attributes}.
    Errors while reading one job's steps are logged and that job is skipped;
    an error from `sp_help_job` itself propagates to the caller.
    """
    jobs: Dict[str, Dict[str, Any]] = {}

    # First, get all jobs
    jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
    jobs_data = {}

    # SQLAlchemy 1.3 support was dropped in Sept 2023 (PR #8810)
    # SQLAlchemy 1.4+ returns LegacyRow objects that don't support dictionary-style .get() method
    # Use .mappings() to get MappingResult with dictionary-like rows that support .get()
    for row in jobs_result.mappings():
        job_id = str(row["job_id"])
        jobs_data[job_id] = {
            "job_id": job_id,
            "name": row["name"],
            "description": row.get("description", ""),
            "date_created": row.get("date_created"),
            "date_modified": row.get("date_modified"),
            "enabled": row.get("enabled", 1),
        }

    # Now get job steps for each job, filtering by database
    for job_id, job_info in jobs_data.items():
        try:
            # Get steps for this specific job. job_id is the identifier the
            # server itself returned above, not user-supplied input.
            steps_result = conn.execute(
                f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
            )

            job_steps = {}
            # Use .mappings() for dictionary-like access (SQLAlchemy 1.4+ compatibility)
            for step_row in steps_result.mappings():
                # The column can be present but NULL; `.get(key, "")` would
                # then hand back None and `.lower()` would raise, discarding
                # every step of the job. Normalise NULL to "" first.
                step_database = step_row.get("database_name") or ""

                # Only include steps that run against our target database
                # (or that are not bound to any database).
                if step_database and step_database.lower() != db_name.lower():
                    continue

                job_steps[step_row["step_id"]] = {
                    "job_id": job_id,
                    "job_name": job_info["name"],
                    "description": job_info["description"],
                    "date_created": job_info["date_created"],
                    "date_modified": job_info["date_modified"],
                    "step_id": step_row["step_id"],
                    "step_name": step_row["step_name"],
                    "subsystem": step_row.get("subsystem", ""),
                    "command": step_row.get("command", ""),
                    "database_name": step_database,
                }

            # Only add job if it has relevant steps
            if job_steps:
                jobs[job_info["name"]] = job_steps

        except Exception as step_error:
            logger.warning(
                f"Failed to get steps for job {job_info['name']}: {step_error}"
            )
            continue

    return jobs
|
|
537
|
+
|
|
538
|
+
def _get_jobs_via_direct_query(
|
|
539
|
+
self, conn: Connection, db_name: str
|
|
540
|
+
) -> Dict[str, Dict[str, Any]]:
|
|
541
|
+
"""
|
|
542
|
+
Original method using direct table access for on-premises SQL Server.
|
|
543
|
+
"""
|
|
326
544
|
jobs_data = conn.execute(
|
|
327
545
|
f"""
|
|
328
546
|
SELECT
|
|
@@ -345,6 +563,7 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
345
563
|
where database_name = '{db_name}'
|
|
346
564
|
"""
|
|
347
565
|
)
|
|
566
|
+
|
|
348
567
|
jobs: Dict[str, Dict[str, Any]] = {}
|
|
349
568
|
for row in jobs_data:
|
|
350
569
|
step_data = dict(
|
|
@@ -357,11 +576,13 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
357
576
|
step_name=row["step_name"],
|
|
358
577
|
subsystem=row["subsystem"],
|
|
359
578
|
command=row["command"],
|
|
579
|
+
database_name=row["database_name"],
|
|
360
580
|
)
|
|
361
581
|
if row["name"] in jobs:
|
|
362
582
|
jobs[row["name"]][row["step_id"]] = step_data
|
|
363
583
|
else:
|
|
364
584
|
jobs[row["name"]] = {row["step_id"]: step_data}
|
|
585
|
+
|
|
365
586
|
return jobs
|
|
366
587
|
|
|
367
588
|
def loop_jobs(
|
|
@@ -371,21 +592,59 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
371
592
|
) -> Iterable[MetadataWorkUnit]:
|
|
372
593
|
"""
|
|
373
594
|
Loop MS SQL jobs as dataFlow-s.
|
|
374
|
-
|
|
595
|
+
Now supports both managed and on-premises SQL Server.
|
|
375
596
|
"""
|
|
376
597
|
db_name = self.get_db_name(inspector)
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
598
|
+
|
|
599
|
+
try:
|
|
600
|
+
with inspector.engine.connect() as conn:
|
|
601
|
+
jobs = self._get_jobs(conn, db_name)
|
|
602
|
+
|
|
603
|
+
if not jobs:
|
|
604
|
+
logger.info(f"No jobs found for database: {db_name}")
|
|
605
|
+
return
|
|
606
|
+
|
|
607
|
+
logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
|
|
608
|
+
|
|
609
|
+
for job_name, job_steps in jobs.items():
|
|
610
|
+
try:
|
|
611
|
+
job = MSSQLJob(
|
|
612
|
+
name=job_name,
|
|
613
|
+
env=sql_config.env,
|
|
614
|
+
db=db_name,
|
|
615
|
+
platform_instance=sql_config.platform_instance,
|
|
616
|
+
)
|
|
617
|
+
data_flow = MSSQLDataFlow(entity=job)
|
|
618
|
+
yield from self.construct_flow_workunits(data_flow=data_flow)
|
|
619
|
+
yield from self.loop_job_steps(job, job_steps)
|
|
620
|
+
|
|
621
|
+
except Exception as job_error:
|
|
622
|
+
logger.warning(f"Failed to process job {job_name}: {job_error}")
|
|
623
|
+
self.report.warning(
|
|
624
|
+
message=f"Failed to process job {job_name}",
|
|
625
|
+
title="SQL Server Jobs Extraction",
|
|
626
|
+
context="Error occurred while processing individual job",
|
|
627
|
+
exc=job_error,
|
|
628
|
+
)
|
|
629
|
+
continue
|
|
630
|
+
|
|
631
|
+
except Exception as e:
|
|
632
|
+
error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
|
|
633
|
+
logger.error(error_message)
|
|
634
|
+
|
|
635
|
+
# Provide specific guidance for permission issues
|
|
636
|
+
if "permission" in str(e).lower() or "denied" in str(e).lower():
|
|
637
|
+
permission_guidance = (
|
|
638
|
+
"For managed SQL Server services, ensure the following permissions are granted:\n"
|
|
639
|
+
"GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
|
|
640
|
+
"GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
|
|
641
|
+
"For on-premises SQL Server, you may also need:\n"
|
|
642
|
+
"GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
|
|
643
|
+
"GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
|
|
385
644
|
)
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
645
|
+
logger.info(permission_guidance)
|
|
646
|
+
|
|
647
|
+
raise e
|
|
389
648
|
|
|
390
649
|
def loop_job_steps(
|
|
391
650
|
self, job: MSSQLJob, job_steps: Dict[str, Any]
|
|
@@ -405,7 +664,7 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
405
664
|
self,
|
|
406
665
|
inspector: Inspector,
|
|
407
666
|
schema: str,
|
|
408
|
-
sql_config: SQLServerConfig,
|
|
667
|
+
sql_config: SQLServerConfig, # type: ignore
|
|
409
668
|
) -> Iterable[MetadataWorkUnit]:
|
|
410
669
|
"""
|
|
411
670
|
Loop schema data for get stored procedures as dataJob-s.
|
|
@@ -714,25 +973,29 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
714
973
|
url = self.config.get_sql_alchemy_url()
|
|
715
974
|
logger.debug(f"sql_alchemy_url={url}")
|
|
716
975
|
engine = create_engine(url, **self.config.options)
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
976
|
+
|
|
977
|
+
if (
|
|
978
|
+
self.config.database
|
|
979
|
+
and self.config.database != ""
|
|
980
|
+
or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
|
|
981
|
+
):
|
|
982
|
+
inspector = inspect(engine)
|
|
983
|
+
yield inspector
|
|
984
|
+
else:
|
|
985
|
+
with engine.begin() as conn:
|
|
722
986
|
databases = conn.execute(
|
|
723
987
|
"SELECT name FROM master.sys.databases WHERE name NOT IN \
|
|
724
988
|
('master', 'model', 'msdb', 'tempdb', 'Resource', \
|
|
725
989
|
'distribution' , 'reportserver', 'reportservertempdb'); "
|
|
726
|
-
)
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
yield inspector
|
|
990
|
+
).fetchall()
|
|
991
|
+
|
|
992
|
+
for db in databases:
|
|
993
|
+
if self.config.database_pattern.allowed(db["name"]):
|
|
994
|
+
url = self.config.get_sql_alchemy_url(current_db=db["name"])
|
|
995
|
+
engine = create_engine(url, **self.config.options)
|
|
996
|
+
inspector = inspect(engine)
|
|
997
|
+
self.current_database = db["name"]
|
|
998
|
+
yield inspector
|
|
736
999
|
|
|
737
1000
|
def get_identifier(
|
|
738
1001
|
self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
|
|
@@ -763,13 +1026,22 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
763
1026
|
yield from auto_workunit(
|
|
764
1027
|
generate_procedure_lineage(
|
|
765
1028
|
schema_resolver=self.get_schema_resolver(),
|
|
766
|
-
procedure=procedure,
|
|
1029
|
+
procedure=procedure.to_base_procedure(),
|
|
767
1030
|
procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
|
|
768
1031
|
is_temp_table=self.is_temp_table,
|
|
1032
|
+
default_db=procedure.db,
|
|
1033
|
+
default_schema=procedure.schema,
|
|
769
1034
|
)
|
|
770
1035
|
)
|
|
771
1036
|
|
|
772
1037
|
def is_temp_table(self, name: str) -> bool:
|
|
1038
|
+
if any(
|
|
1039
|
+
re.match(pattern, name, flags=re.IGNORECASE)
|
|
1040
|
+
for pattern in self.config.temporary_tables_pattern
|
|
1041
|
+
):
|
|
1042
|
+
logger.debug(f"temp table matched by pattern {name}")
|
|
1043
|
+
return True
|
|
1044
|
+
|
|
773
1045
|
try:
|
|
774
1046
|
parts = name.split(".")
|
|
775
1047
|
table_name = parts[-1]
|
|
@@ -803,3 +1075,45 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
803
1075
|
if self.config.convert_urns_to_lowercase
|
|
804
1076
|
else table_ref_str
|
|
805
1077
|
)
|
|
1078
|
+
|
|
1079
|
+
def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
    """
    Yield the schemas allowed by the parent implementation.

    When `quote_schemas` is enabled in the config, each schema name is
    wrapped in `quoted_name(schema, True)`; otherwise it is yielded unchanged.
    """
    for schema_name in super().get_allowed_schemas(inspector, db_name):
        yield (
            quoted_name(schema_name, True)
            if self.config.quote_schemas
            else schema_name
        )
|
|
1085
|
+
|
|
1086
|
+
def get_db_name(self, inspector: Inspector) -> str:
    """
    Return the database name the inspector's engine points at.

    Resolution order:
    1. `engine.url.database`, with surrounding double quotes stripped.
    2. The `DATABASE=` attribute inside an `odbc_connect` URL query
       parameter (URL-decoded, matched case-insensitively).
    3. An empty string when neither is available.

    Raises:
        RuntimeError: if inspecting the engine URL fails unexpectedly.
    """
    engine = inspector.engine

    try:
        url = getattr(engine, "url", None) if engine else None

        database = getattr(url, "database", None)
        if database:
            return str(database).strip('"')

        query = getattr(url, "query", None)
        if query and "odbc_connect" in query:
            # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
            # The trailing ';' is not required here: DATABASE may be the
            # last attribute of the connection string with no terminator.
            match = re.search(
                r"DATABASE=([^;]*)",
                urllib.parse.unquote_plus(str(query["odbc_connect"])),
                flags=re.IGNORECASE,
            )

            if match and match.group(1):
                return match.group(1)

        return ""

    except Exception as e:
        raise RuntimeError(
            "Unable to get database name from Sqlalchemy inspector"
        ) from e
|