acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/sql/clickhouse.py
@@ -379,7 +379,9 @@ clickhouse_datetime_format = "%Y-%m-%d %H:%M:%S"
 @platform_name("ClickHouse")
 @config_class(ClickHouseConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 class ClickHouseSource(TwoTierSQLAlchemySource):
     """

datahub/ingestion/source/sql/cockroachdb.py
@@ -26,7 +26,6 @@ class CockroachDBConfig(PostgresConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class CockroachDBSource(PostgresSource):
     config: CockroachDBConfig
 

datahub/ingestion/source/sql/hana.py
@@ -27,7 +27,9 @@ class HanaConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class HanaSource(SQLAlchemySource):
     def __init__(self, config: HanaConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "hana")

datahub/ingestion/source/sql/hive_metastore.py
@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,
@@ -161,7 +160,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @platform_name("Hive Metastore")
 @config_class(HiveMetastore)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
 @capability(
@@ -599,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
                 yield dpi_aspect
 
             yield MetadataChangeProposalWrapper(
-                entityType="dataset",
-                changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
-                aspectName="subTypes",
                 aspect=SubTypesClass(typeNames=[self.table_subtype]),
             ).as_workunit()
 
@@ -808,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
 
         # Add views subtype
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.view_subtype]),
         ).as_workunit()
 
@@ -822,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             viewLogic=dataset.view_definition if dataset.view_definition else "",
         )
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="viewProperties",
             aspect=view_properties_aspect,
         ).as_workunit()
 
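The hive_metastore hunks above drop the explicit entityType, changeType, and aspectName arguments because MetadataChangeProposalWrapper can derive them: changeType defaults to UPSERT, and the entity type and aspect name are inferred from the URN and the aspect instance. A minimal sketch of the shortened call; the URN and type name here are illustrative, not taken from the diff:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# Illustrative dataset URN; any valid dataset URN behaves the same way.
dataset_urn = make_dataset_urn(platform="hive", name="db.example_table", env="PROD")

# Only the URN and the aspect are passed; the wrapper fills in entityType,
# aspectName, and an UPSERT changeType on its own.
mcp = MetadataChangeProposalWrapper(
    entityUrn=dataset_urn,
    aspect=SubTypesClass(typeNames=["Table"]),
)
workunit = mcp.as_workunit()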
 

datahub/ingestion/source/sql/mariadb.py
@@ -15,7 +15,6 @@ from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MariaDBSource(MySQLSource):
     def get_platform(self):
         return "mariadb"

datahub/ingestion/source/sql/mssql/source.py
@@ -27,6 +27,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
     MSSQLDataFlow,
@@ -174,7 +175,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class SQLServerSource(SQLAlchemySource):
     """
     This plugin extracts the following:
@@ -323,9 +339,11 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.report_failure(
-                "jobs",
-                f"Failed to list jobs due to error {e}",
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )
 
     def get_schema_level_workunits(
@@ -343,12 +361,158 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_stored_procedures(inspector, schema, self.config)
         except Exception as e:
-            self.report.report_failure(
-                "jobs",
-                f"Failed to list jobs due to error {e}",
+            self.report.failure(
+                message="Failed to list stored procedures",
+                title="SQL Server Stored Procedures Extraction",
+                context="Error occurred during schema-level stored procedure extraction",
+                exc=e,
             )
 
+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        try:
+            # Try to access system tables directly - this typically fails in RDS
+            conn.execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
+            logger.debug(
+                "Direct table access successful - likely on-premises environment"
+            )
+            return False
+        except Exception:
+            logger.debug("Direct table access failed - likely RDS/managed environment")
+            return True
+
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        for row in jobs_result:
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                for step_row in steps_result:
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
@@ -371,6 +535,7 @@ class SQLServerSource(SQLAlchemySource):
                 where database_name = '{db_name}'
                 """
         )
+
         jobs: Dict[str, Dict[str, Any]] = {}
         for row in jobs_data:
             step_data = dict(
@@ -383,11 +548,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs
 
     def loop_jobs(
@@ -397,21 +564,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-        :return:
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-        with inspector.engine.connect() as conn:
-            jobs = self._get_jobs(conn, db_name)
-            for job_name, job_steps in jobs.items():
-                job = MSSQLJob(
-                    name=job_name,
-                    env=sql_config.env,
-                    db=db_name,
-                    platform_instance=sql_config.platform_instance,
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-                data_flow = MSSQLDataFlow(entity=job)
-                yield from self.construct_flow_workunits(data_flow=data_flow)
-                yield from self.loop_job_steps(job, job_steps)
+                logger.info(permission_guidance)
+
+            raise e
 
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
@@ -740,25 +945,25 @@ class SQLServerSource(SQLAlchemySource):
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
-        with engine.connect() as conn:
-            if self.config.database and self.config.database != "":
-                inspector = inspect(conn)
-                yield inspector
-            else:
+
+        if self.config.database and self.config.database != "":
+            inspector = inspect(engine)
+            yield inspector
+        else:
+            with engine.begin() as conn:
                 databases = conn.execute(
                     "SELECT name FROM master.sys.databases WHERE name NOT IN \
                     ('master', 'model', 'msdb', 'tempdb', 'Resource', \
                     'distribution' , 'reportserver', 'reportservertempdb'); "
-                )
-                for db in databases:
-                    if self.config.database_pattern.allowed(db["name"]):
-                        url = self.config.get_sql_alchemy_url(current_db=db["name"])
-                        with create_engine(
-                            url, **self.config.options
-                        ).connect() as conn:
-                            inspector = inspect(conn)
-                            self.current_database = db["name"]
-                            yield inspector
+                ).fetchall()
+
+            for db in databases:
+                if self.config.database_pattern.allowed(db["name"]):
+                    url = self.config.get_sql_alchemy_url(current_db=db["name"])
+                    engine = create_engine(url, **self.config.options)
+                    inspector = inspect(engine)
+                    self.current_database = db["name"]
+                    yield inspector
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
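For context on the get_inspectors change above: SQLAlchemy's inspect() accepts an Engine as well as a Connection, and inspecting the engine means the yielded inspector is not tied to a connection that a with block will close. A rough sketch of the pattern; the connection URL is illustrative:

from sqlalchemy import create_engine, inspect

# Illustrative URL; substitute real credentials and host.
engine = create_engine("mssql+pytds://user:password@localhost:1433/master")

# Inspecting the Engine lets SQLAlchemy check out connections lazily,
# so the inspector remains usable after it is yielded to the caller.
inspector = inspect(engine)
print(inspector.get_schema_names())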

datahub/ingestion/source/sql/mysql.py
@@ -65,7 +65,6 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MySQLSource(TwoTierSQLAlchemySource):
     """
     This plugin extracts the following:

datahub/ingestion/source/sql/oracle.py
@@ -441,7 +441,7 @@ class OracleInspectorObjectWrapper:
             "\nac.constraint_name,"
             "\nac.constraint_type,"
             "\nacc.column_name AS local_column,"
-            "\nac.r_table_name AS remote_table,"
+            "\nac.table_name AS remote_table,"
             "\nrcc.column_name AS remote_column,"
             "\nac.r_owner AS remote_owner,"
             "\nacc.position AS loc_pos,"

datahub/ingestion/source/sql/postgres.py
@@ -131,7 +131,6 @@ class PostgresConfig(BasePostgresConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following: