acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/clickhouse.py
@@ -379,7 +379,9 @@ clickhouse_datetime_format = "%Y-%m-%d %H:%M:%S"
 @platform_name("ClickHouse")
 @config_class(ClickHouseConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 class ClickHouseSource(TwoTierSQLAlchemySource):
     """
datahub/ingestion/source/sql/cockroachdb.py
@@ -26,7 +26,6 @@ class CockroachDBConfig(PostgresConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class CockroachDBSource(PostgresSource):
     config: CockroachDBConfig
 
datahub/ingestion/source/sql/hana.py
@@ -27,7 +27,9 @@ class HanaConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class HanaSource(SQLAlchemySource):
     def __init__(self, config: HanaConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "hana")
datahub/ingestion/source/sql/hive_metastore.py
@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,
@@ -161,7 +160,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @platform_name("Hive Metastore")
 @config_class(HiveMetastore)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
 @capability(
@@ -599,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             yield dpi_aspect
 
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.table_subtype]),
         ).as_workunit()
 
@@ -808,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
 
         # Add views subtype
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.view_subtype]),
         ).as_workunit()
 
@@ -822,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             viewLogic=dataset.view_definition if dataset.view_definition else "",
         )
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="viewProperties",
             aspect=view_properties_aspect,
         ).as_workunit()
 
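The three hive_metastore.py hunks above drop the explicit `entityType`, `changeType`, and `aspectName` arguments: `MetadataChangeProposalWrapper` can infer the entity type from the URN and the aspect name from the aspect object, and the change type defaults to UPSERT. A minimal sketch of the shorter construction (the URN below is illustrative and assumes the acryl-datahub package is installed):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType ("dataset"), aspectName ("subTypes"), and changeType (UPSERT)
# are all inferred by the wrapper from the URN and the aspect instance.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)",
    aspect=SubTypesClass(typeNames=["Table"]),
)
# Inside a source this is then emitted via .as_workunit(), as in the hunks above.
```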
datahub/ingestion/source/sql/mariadb.py
@@ -15,7 +15,6 @@ from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MariaDBSource(MySQLSource):
     def get_platform(self):
         return "mariadb"
datahub/ingestion/source/sql/mssql/source.py
@@ -27,6 +27,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
     MSSQLDataFlow,
@@ -174,7 +175,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
-@capability(
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class SQLServerSource(SQLAlchemySource):
     """
     This plugin extracts the following:
@@ -323,9 +339,11 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.
-                "jobs",
-
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )
 
     def get_schema_level_workunits(
@@ -343,12 +361,158 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_stored_procedures(inspector, schema, self.config)
         except Exception as e:
-            self.report.
-                "
-
+            self.report.failure(
+                message="Failed to list stored procedures",
+                title="SQL Server Stored Procedures Extraction",
+                context="Error occurred during schema-level stored procedure extraction",
+                exc=e,
             )
 
+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        try:
+            # Try to access system tables directly - this typically fails in RDS
+            conn.execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
+            logger.debug(
+                "Direct table access successful - likely on-premises environment"
+            )
+            return False
+        except Exception:
+            logger.debug("Direct table access failed - likely RDS/managed environment")
+            return True
+
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        for row in jobs_result:
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                for step_row in steps_result:
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
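The `_get_jobs` rewrite above follows a detect-then-fallback pattern: probe whether direct access to the `msdb` system tables works, prefer the faster path when it does, and fall back to the other retrieval method otherwise. A standalone sketch of that pattern; `fetch_via_system_tables` and `fetch_via_stored_procedures` are hypothetical stand-ins for the real query helpers:

```python
from typing import Any, Callable, Dict


def probe_direct_access(execute: Callable[[str], Any]) -> bool:
    """Return True when direct msdb table access works (on-premises style)."""
    try:
        execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
        return True
    except Exception:
        return False


def fetch_jobs(
    execute: Callable[[str], Any],
    fetch_via_system_tables: Callable[[], Dict[str, Any]],
    fetch_via_stored_procedures: Callable[[], Dict[str, Any]],
) -> Dict[str, Any]:
    """Try the method suited to the detected environment, fall back to the other."""
    if probe_direct_access(execute):
        primary, fallback = fetch_via_system_tables, fetch_via_stored_procedures
    else:
        primary, fallback = fetch_via_stored_procedures, fetch_via_system_tables
    try:
        return primary()
    except Exception:
        return fallback()
```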
@@ -371,6 +535,7 @@ class SQLServerSource(SQLAlchemySource):
             where database_name = '{db_name}'
             """
         )
+
         jobs: Dict[str, Dict[str, Any]] = {}
         for row in jobs_data:
             step_data = dict(
@@ -383,11 +548,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs
 
     def loop_jobs(
@@ -397,21 +564,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-
-
-
-
-
-
-
-
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-
-
-
+                logger.info(permission_guidance)
+
+            raise e
 
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
@@ -740,25 +945,25 @@ class SQLServerSource(SQLAlchemySource):
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
-
-
-
-
-
+
+        if self.config.database and self.config.database != "":
+            inspector = inspect(engine)
+            yield inspector
+        else:
+            with engine.begin() as conn:
                 databases = conn.execute(
                     "SELECT name FROM master.sys.databases WHERE name NOT IN \
                     ('master', 'model', 'msdb', 'tempdb', 'Resource', \
                     'distribution' , 'reportserver', 'reportservertempdb'); "
-                )
-
-
-
-
-
-
-
-
-                yield inspector
+                ).fetchall()
+
+                for db in databases:
+                    if self.config.database_pattern.allowed(db["name"]):
+                        url = self.config.get_sql_alchemy_url(current_db=db["name"])
+                        engine = create_engine(url, **self.config.options)
+                        inspector = inspect(engine)
+                        self.current_database = db["name"]
+                        yield inspector
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
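The last mssql hunk above replaces the single-inspector flow: when no single `database` is configured, the source now enumerates user databases from `master.sys.databases` and yields one engine/inspector per database allowed by `database_pattern`. A rough standalone sketch of that loop using plain SQLAlchemy; `make_url_for_db` and `is_allowed` are hypothetical placeholders for the config-driven pieces:

```python
from typing import Callable, Iterable

from sqlalchemy import create_engine, inspect, text
from sqlalchemy.engine.reflection import Inspector


def inspectors_per_database(
    base_url: str,
    make_url_for_db: Callable[[str], str],  # hypothetical: builds a URL scoped to one database
    is_allowed: Callable[[str], bool],  # hypothetical: stands in for database_pattern.allowed
) -> Iterable[Inspector]:
    """Yield one SQLAlchemy inspector per user database, mirroring the loop above."""
    engine = create_engine(base_url)
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                "SELECT name FROM master.sys.databases "
                "WHERE name NOT IN ('master', 'model', 'msdb', 'tempdb')"
            )
        ).fetchall()
    for (db_name,) in rows:
        if is_allowed(db_name):
            yield inspect(create_engine(make_url_for_db(db_name)))
```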
datahub/ingestion/source/sql/mysql.py
@@ -65,7 +65,6 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MySQLSource(TwoTierSQLAlchemySource):
     """
     This plugin extracts the following:
datahub/ingestion/source/sql/oracle.py
@@ -441,7 +441,7 @@ class OracleInspectorObjectWrapper:
             "\nac.constraint_name,"
             "\nac.constraint_type,"
             "\nacc.column_name AS local_column,"
-            "\nac.
+            "\nac.table_name AS remote_table,"
             "\nrcc.column_name AS remote_column,"
             "\nac.r_owner AS remote_owner,"
             "\nacc.position AS loc_pos,"
datahub/ingestion/source/sql/postgres.py
@@ -131,7 +131,6 @@ class PostgresConfig(BasePostgresConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following: