acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
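
The largest source change reproduced below is datahub/ingestion/source/sql/mssql/source.py (+336 -57), which adds two SQLServerConfig fields: quote_schemas and is_aws_rds. As a minimal sketch of the new config surface (not part of the package; host, credentials, and values are illustrative placeholders):

# Minimal sketch, assuming the module path shown in this diff's file list.
# host_port, username, and password are placeholder values.
from datahub.ingestion.source.sql.mssql.source import SQLServerConfig

config = SQLServerConfig.parse_obj(
    {
        "host_port": "example.rds.amazonaws.com:1433",
        "username": "datahub_read",
        "password": "placeholder",
        # New in this release: skips the SELECT @@servername auto-detection.
        "is_aws_rds": True,
        # New in this release: schemas are yielded as sqlalchemy quoted_name objects.
        "quote_schemas": True,
    }
)

When is_aws_rds is left unset, the source falls back to the @@servername heuristic implemented in _detect_rds_environment in the diff below.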
datahub/ingestion/source/sql/mssql/source.py
@@ -1,7 +1,7 @@
 import logging
 import re
 import urllib.parse
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import pydantic
 import sqlalchemy.dialects.mssql
@@ -10,9 +10,10 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -27,6 +28,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
     MSSQLDataFlow,
@@ -40,7 +42,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
 )
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import (
@@ -74,7 +75,7 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
-    scheme: str = Field(default="mssql+pytds"
+    scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
 
     # TODO: rename to include_procedures ?
     include_stored_procedures: bool = Field(
@@ -130,10 +131,18 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         "match the entire table name in database.schema.table format. Defaults are to set in such a way "
         "to ignore the temporary staging tables created by known ETL tools.",
     )
+    quote_schemas: bool = Field(
+        default=False,
+        description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
+    )
+    is_aws_rds: Optional[bool] = Field(
+        default=None,
+        description="Indicates if the SQL Server instance is running on AWS RDS. When None (default), automatic detection will be attempted using server name analysis.",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
-        if values["use_odbc"] and "driver" not in v:
+        if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
             raise ValueError("uri_args must contain a 'driver' option")
         elif not values["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
@@ -144,22 +153,36 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         uri_opts: Optional[Dict[str, Any]] = None,
         current_db: Optional[str] = None,
     ) -> str:
+        current_db = current_db or self.database
+
         if self.use_odbc:
             # Ensure that the import is available.
             import pyodbc  # noqa: F401
 
             self.scheme = "mssql+pyodbc"
 
+            # ODBC requires a database name, otherwise it will interpret host_port
+            # as a pre-defined ODBC connection name.
+            current_db = current_db or "master"
+
         uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
             self.scheme,  # type: ignore
             self.username,
             self.password.get_secret_value() if self.password else None,
             self.host_port,  # type: ignore
-            current_db
+            current_db,
             uri_opts=uri_opts,
         )
         if self.use_odbc:
-
+            final_uri_args = self.uri_args.copy()
+            if final_uri_args and current_db:
+                final_uri_args.update({"database": current_db})
+
+            uri = (
+                f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
+                if final_uri_args
+                else uri
+            )
         return uri
 
     @property
@@ -174,7 +197,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
-@capability(
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class SQLServerSource(SQLAlchemySource):
     """
     This plugin extracts the following:
@@ -323,32 +361,186 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.
-                "jobs",
-
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Uses explicit configuration if provided, otherwise attempts automatic detection.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        if self.config.is_aws_rds is not None:
+            logger.info(
+                f"Using explicit is_aws_rds configuration: {self.config.is_aws_rds}"
+            )
+            return self.config.is_aws_rds
+
+        try:
+            result = conn.execute("SELECT @@servername AS server_name")
+            server_name_row = result.fetchone()
+            if server_name_row:
+                server_name = server_name_row["server_name"].lower()
+
+                aws_indicators = ["amazon", "amzn", "amaz", "ec2", "rds.amazonaws.com"]
+                is_rds = any(indicator in server_name for indicator in aws_indicators)
+                if is_rds:
+                    logger.info(f"AWS RDS detected based on server name: {server_name}")
+                else:
+                    logger.info(
+                        f"Non-RDS environment detected based on server name: {server_name}"
+                    )
+
+                return is_rds
+            else:
+                logger.warning(
+                    "Could not retrieve server name, assuming non-RDS environment"
                 )
+                return False
+
+        except Exception as e:
+            logger.warning(
+                f"Failed to detect RDS/managed vs on-prem env, assuming non-RDS environment ({e})"
+            )
+            return False
 
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        # SQLAlchemy 1.3 support was dropped in Sept 2023 (PR #8810)
+        # SQLAlchemy 1.4+ returns LegacyRow objects that don't support dictionary-style .get() method
+        # Use .mappings() to get MappingResult with dictionary-like rows that support .get()
+        for row in jobs_result.mappings():
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                # Use .mappings() for dictionary-like access (SQLAlchemy 1.4+ compatibility)
+                for step_row in steps_result.mappings():
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
@@ -371,6 +563,7 @@ class SQLServerSource(SQLAlchemySource):
             where database_name = '{db_name}'
             """
         )
+
        jobs: Dict[str, Dict[str, Any]] = {}
        for row in jobs_data:
            step_data = dict(
@@ -383,11 +576,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs
 
     def loop_jobs(
@@ -397,21 +592,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-
-
-
-
-
-
-
-
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
                     "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-
-
-
+                logger.info(permission_guidance)
+
+            raise e
 
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
@@ -431,7 +664,7 @@ class SQLServerSource(SQLAlchemySource):
         self,
         inspector: Inspector,
         schema: str,
-        sql_config: SQLServerConfig,
+        sql_config: SQLServerConfig,  # type: ignore
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop schema data for get stored procedures as dataJob-s.
@@ -740,25 +973,29 @@ class SQLServerSource(SQLAlchemySource):
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
-
-
-
-
-
+
+        if (
+            self.config.database
+            and self.config.database != ""
+            or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
+        ):
+            inspector = inspect(engine)
+            yield inspector
+        else:
+            with engine.begin() as conn:
                 databases = conn.execute(
                     "SELECT name FROM master.sys.databases WHERE name NOT IN \
                     ('master', 'model', 'msdb', 'tempdb', 'Resource', \
                     'distribution' , 'reportserver', 'reportservertempdb'); "
-            )
-
-
-
-
-
-
-
-        yield inspector
+                ).fetchall()
+
+                for db in databases:
+                    if self.config.database_pattern.allowed(db["name"]):
+                        url = self.config.get_sql_alchemy_url(current_db=db["name"])
+                        engine = create_engine(url, **self.config.options)
+                        inspector = inspect(engine)
+                        self.current_database = db["name"]
+                        yield inspector
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -838,3 +1075,45 @@ class SQLServerSource(SQLAlchemySource):
         if self.config.convert_urns_to_lowercase
         else table_ref_str
     )
+
+    def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
+        for schema in super().get_allowed_schemas(inspector, db_name):
+            if self.config.quote_schemas:
+                yield quoted_name(schema, True)
+            else:
+                yield schema
+
+    def get_db_name(self, inspector: Inspector) -> str:
+        engine = inspector.engine
+
+        try:
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "database")
+                and engine.url.database
+            ):
+                return str(engine.url.database).strip('"')
+
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "query")
+                and "odbc_connect" in engine.url.query
+            ):
+                # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
+                database = re.search(
+                    r"DATABASE=([^;]*);",
+                    urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
+                    flags=re.IGNORECASE,
+                )
+
+                if database and database.group(1):
+                    return database.group(1)
+
+            return ""
+
+        except Exception as e:
+            raise RuntimeError(
+                "Unable to get database name from Sqlalchemy inspector"
+            ) from e