acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  import re
3
3
  import urllib.parse
4
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
4
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
5
5
 
6
6
  import pydantic
7
7
  import sqlalchemy.dialects.mssql
@@ -10,9 +10,11 @@ from sqlalchemy import create_engine, inspect
10
10
  from sqlalchemy.engine.base import Connection
11
11
  from sqlalchemy.engine.reflection import Inspector
12
12
  from sqlalchemy.exc import ProgrammingError, ResourceClosedError
13
+ from sqlalchemy.sql import quoted_name
13
14
 
14
15
  import datahub.metadata.schema_classes as models
15
- from datahub.configuration.common import AllowDenyPattern
16
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
17
+ from datahub.configuration.pattern_utils import UUID_REGEX
16
18
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
17
19
  from datahub.ingestion.api.common import PipelineContext
18
20
  from datahub.ingestion.api.decorators import (
@@ -26,6 +28,7 @@ from datahub.ingestion.api.decorators import (
26
28
  from datahub.ingestion.api.source import StructuredLogLevel
27
29
  from datahub.ingestion.api.source_helpers import auto_workunit
28
30
  from datahub.ingestion.api.workunit import MetadataWorkUnit
31
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
29
32
  from datahub.ingestion.source.sql.mssql.job_models import (
30
33
  JobStep,
31
34
  MSSQLDataFlow,
@@ -37,19 +40,18 @@ from datahub.ingestion.source.sql.mssql.job_models import (
37
40
  ProcedureParameter,
38
41
  StoredProcedure,
39
42
  )
40
- from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
41
- generate_procedure_lineage,
42
- )
43
43
  from datahub.ingestion.source.sql.sql_common import (
44
44
  SQLAlchemySource,
45
- SqlWorkUnit,
46
45
  register_custom_type,
47
46
  )
48
47
  from datahub.ingestion.source.sql.sql_config import (
49
48
  BasicSQLAlchemyConfig,
50
- make_sqlalchemy_uri,
51
49
  )
52
50
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
51
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
52
+ from datahub.ingestion.source.sql.stored_procedures.base import (
53
+ generate_procedure_lineage,
54
+ )
53
55
  from datahub.utilities.file_backed_collections import FileBackedList
54
56
 
55
57
  logger: logging.Logger = logging.getLogger(__name__)
@@ -60,11 +62,22 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
60
62
  register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
61
63
  register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)
62
64
 
65
+ # Patterns copied from Snowflake source
66
+ DEFAULT_TEMP_TABLES_PATTERNS = [
67
+ r".*\.FIVETRAN_.*_STAGING\..*", # fivetran
68
+ r".*__DBT_TMP$", # dbt
69
+ rf".*\.SEGMENT_{UUID_REGEX}", # segment
70
+ rf".*\.STAGING_.*_{UUID_REGEX}", # stitch
71
+ r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations
72
+ ]
73
+
63
74
 
64
75
  class SQLServerConfig(BasicSQLAlchemyConfig):
65
76
  # defaults
66
77
  host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
67
- scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
78
+ scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
79
+
80
+ # TODO: rename to include_procedures ?
68
81
  include_stored_procedures: bool = Field(
69
82
  default=True,
70
83
  description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
@@ -112,10 +125,24 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
112
125
  default=False,
113
126
  description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
114
127
  )
128
+ temporary_tables_pattern: List[str] = Field(
129
+ default=DEFAULT_TEMP_TABLES_PATTERNS,
130
+ description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
131
+ "match the entire table name in database.schema.table format. Defaults are to set in such a way "
132
+ "to ignore the temporary staging tables created by known ETL tools.",
133
+ )
134
+ quote_schemas: bool = Field(
135
+ default=False,
136
+ description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
137
+ )
138
+ is_aws_rds: Optional[bool] = Field(
139
+ default=None,
140
+ description="Indicates if the SQL Server instance is running on AWS RDS. When None (default), automatic detection will be attempted using server name analysis.",
141
+ )
115
142
 
116
143
  @pydantic.validator("uri_args")
117
144
  def passwords_match(cls, v, values, **kwargs):
118
- if values["use_odbc"] and "driver" not in v:
145
+ if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
119
146
  raise ValueError("uri_args must contain a 'driver' option")
120
147
  elif not values["use_odbc"] and v:
121
148
  raise ValueError("uri_args is not supported when ODBC is disabled")
@@ -126,22 +153,36 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
126
153
  uri_opts: Optional[Dict[str, Any]] = None,
127
154
  current_db: Optional[str] = None,
128
155
  ) -> str:
156
+ current_db = current_db or self.database
157
+
129
158
  if self.use_odbc:
130
159
  # Ensure that the import is available.
131
160
  import pyodbc # noqa: F401
132
161
 
133
162
  self.scheme = "mssql+pyodbc"
134
163
 
164
+ # ODBC requires a database name, otherwise it will interpret host_port
165
+ # as a pre-defined ODBC connection name.
166
+ current_db = current_db or "master"
167
+
135
168
  uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
136
169
  self.scheme, # type: ignore
137
170
  self.username,
138
171
  self.password.get_secret_value() if self.password else None,
139
172
  self.host_port, # type: ignore
140
- current_db if current_db else self.database,
173
+ current_db,
141
174
  uri_opts=uri_opts,
142
175
  )
143
176
  if self.use_odbc:
144
- uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}"
177
+ final_uri_args = self.uri_args.copy()
178
+ if final_uri_args and current_db:
179
+ final_uri_args.update({"database": current_db})
180
+
181
+ uri = (
182
+ f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
183
+ if final_uri_args
184
+ else uri
185
+ )
145
186
  return uri
146
187
 
147
188
  @property
@@ -156,7 +197,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
156
197
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
157
198
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
158
199
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
159
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
200
+ @capability(
201
+ SourceCapability.LINEAGE_COARSE,
202
+ "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
203
+ subtype_modifier=[
204
+ SourceCapabilityModifier.STORED_PROCEDURE,
205
+ SourceCapabilityModifier.VIEW,
206
+ ],
207
+ )
208
+ @capability(
209
+ SourceCapability.LINEAGE_FINE,
210
+ "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
211
+ subtype_modifier=[
212
+ SourceCapabilityModifier.STORED_PROCEDURE,
213
+ SourceCapabilityModifier.VIEW,
214
+ ],
215
+ )
160
216
  class SQLServerSource(SQLAlchemySource):
161
217
  """
162
218
  This plugin extracts the following:
@@ -177,6 +233,14 @@ class SQLServerSource(SQLAlchemySource):
177
233
  self.table_descriptions: Dict[str, str] = {}
178
234
  self.column_descriptions: Dict[str, str] = {}
179
235
  self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
236
+
237
+ self.report = SQLSourceReport()
238
+ if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
239
+ self.report.warning(
240
+ title="Potential issue with lineage",
241
+ message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure lineage correct, set 'convert_urns_to_lowercase' to True.",
242
+ )
243
+
180
244
  if self.config.include_descriptions:
181
245
  for inspector in self.get_inspectors():
182
246
  db_name: str = self.get_db_name(inspector)
@@ -297,32 +361,186 @@ class SQLServerSource(SQLAlchemySource):
297
361
  try:
298
362
  yield from self.loop_jobs(inspector, self.config)
299
363
  except Exception as e:
300
- self.report.report_failure(
301
- "jobs",
302
- f"Failed to list jobs due to error {e}",
364
+ self.report.failure(
365
+ message="Failed to list jobs",
366
+ title="SQL Server Jobs Extraction",
367
+ context="Error occurred during database-level job extraction",
368
+ exc=e,
303
369
  )
304
370
 
305
- def get_schema_level_workunits(
306
- self,
307
- inspector: Inspector,
308
- schema: str,
309
- database: str,
310
- ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
311
- yield from super().get_schema_level_workunits(
312
- inspector=inspector,
313
- schema=schema,
314
- database=database,
315
- )
316
- if self.config.include_stored_procedures:
317
- try:
318
- yield from self.loop_stored_procedures(inspector, schema, self.config)
319
- except Exception as e:
320
- self.report.report_failure(
321
- "jobs",
322
- f"Failed to list jobs due to error {e}",
371
+ def _detect_rds_environment(self, conn: Connection) -> bool:
372
+ """
373
+ Detect if we're running in an RDS/managed environment vs on-premises.
374
+ Uses explicit configuration if provided, otherwise attempts automatic detection.
375
+ Returns True if RDS/managed, False if on-premises.
376
+ """
377
+ if self.config.is_aws_rds is not None:
378
+ logger.info(
379
+ f"Using explicit is_aws_rds configuration: {self.config.is_aws_rds}"
380
+ )
381
+ return self.config.is_aws_rds
382
+
383
+ try:
384
+ result = conn.execute("SELECT @@servername AS server_name")
385
+ server_name_row = result.fetchone()
386
+ if server_name_row:
387
+ server_name = server_name_row["server_name"].lower()
388
+
389
+ aws_indicators = ["amazon", "amzn", "amaz", "ec2", "rds.amazonaws.com"]
390
+ is_rds = any(indicator in server_name for indicator in aws_indicators)
391
+ if is_rds:
392
+ logger.info(f"AWS RDS detected based on server name: {server_name}")
393
+ else:
394
+ logger.info(
395
+ f"Non-RDS environment detected based on server name: {server_name}"
396
+ )
397
+
398
+ return is_rds
399
+ else:
400
+ logger.warning(
401
+ "Could not retrieve server name, assuming non-RDS environment"
323
402
  )
403
+ return False
404
+
405
+ except Exception as e:
406
+ logger.warning(
407
+ f"Failed to detect RDS/managed vs on-prem env, assuming non-RDS environment ({e})"
408
+ )
409
+ return False
324
410
 
325
411
  def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
412
+ """
413
+ Get job information with environment detection to choose optimal method first.
414
+ """
415
+ jobs: Dict[str, Dict[str, Any]] = {}
416
+
417
+ # Detect environment to choose optimal method first
418
+ is_rds = self._detect_rds_environment(conn)
419
+
420
+ if is_rds:
421
+ # Managed environment - try stored procedures first
422
+ try:
423
+ jobs = self._get_jobs_via_stored_procedures(conn, db_name)
424
+ logger.info(
425
+ "Successfully retrieved jobs using stored procedures (managed environment)"
426
+ )
427
+ return jobs
428
+ except Exception as sp_error:
429
+ logger.warning(
430
+ f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
431
+ )
432
+ # Try direct query as fallback (might work in some managed environments)
433
+ try:
434
+ jobs = self._get_jobs_via_direct_query(conn, db_name)
435
+ logger.info(
436
+ "Successfully retrieved jobs using direct query fallback in managed environment"
437
+ )
438
+ return jobs
439
+ except Exception as direct_error:
440
+ self.report.failure(
441
+ message="Failed to retrieve jobs in managed environment",
442
+ title="SQL Server Jobs Extraction",
443
+ context="Both stored procedures and direct query methods failed",
444
+ exc=direct_error,
445
+ )
446
+ else:
447
+ # On-premises environment - try direct query first (usually faster)
448
+ try:
449
+ jobs = self._get_jobs_via_direct_query(conn, db_name)
450
+ logger.info(
451
+ "Successfully retrieved jobs using direct query (on-premises environment)"
452
+ )
453
+ return jobs
454
+ except Exception as direct_error:
455
+ logger.warning(
456
+ f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
457
+ )
458
+ # Try stored procedures as fallback
459
+ try:
460
+ jobs = self._get_jobs_via_stored_procedures(conn, db_name)
461
+ logger.info(
462
+ "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
463
+ )
464
+ return jobs
465
+ except Exception as sp_error:
466
+ self.report.failure(
467
+ message="Failed to retrieve jobs in on-premises environment",
468
+ title="SQL Server Jobs Extraction",
469
+ context="Both direct query and stored procedures methods failed",
470
+ exc=sp_error,
471
+ )
472
+
473
+ return jobs
474
+
475
+ def _get_jobs_via_stored_procedures(
476
+ self, conn: Connection, db_name: str
477
+ ) -> Dict[str, Dict[str, Any]]:
478
+ jobs: Dict[str, Dict[str, Any]] = {}
479
+
480
+ # First, get all jobs
481
+ jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
482
+ jobs_data = {}
483
+
484
+ # SQLAlchemy 1.3 support was dropped in Sept 2023 (PR #8810)
485
+ # SQLAlchemy 1.4+ returns LegacyRow objects that don't support dictionary-style .get() method
486
+ # Use .mappings() to get MappingResult with dictionary-like rows that support .get()
487
+ for row in jobs_result.mappings():
488
+ job_id = str(row["job_id"])
489
+ jobs_data[job_id] = {
490
+ "job_id": job_id,
491
+ "name": row["name"],
492
+ "description": row.get("description", ""),
493
+ "date_created": row.get("date_created"),
494
+ "date_modified": row.get("date_modified"),
495
+ "enabled": row.get("enabled", 1),
496
+ }
497
+
498
+ # Now get job steps for each job, filtering by database
499
+ for job_id, job_info in jobs_data.items():
500
+ try:
501
+ # Get steps for this specific job
502
+ steps_result = conn.execute(
503
+ f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
504
+ )
505
+
506
+ job_steps = {}
507
+ # Use .mappings() for dictionary-like access (SQLAlchemy 1.4+ compatibility)
508
+ for step_row in steps_result.mappings():
509
+ # Only include steps that run against our target database
510
+ step_database = step_row.get("database_name", "")
511
+ if step_database.lower() == db_name.lower() or not step_database:
512
+ step_data = {
513
+ "job_id": job_id,
514
+ "job_name": job_info["name"],
515
+ "description": job_info["description"],
516
+ "date_created": job_info["date_created"],
517
+ "date_modified": job_info["date_modified"],
518
+ "step_id": step_row["step_id"],
519
+ "step_name": step_row["step_name"],
520
+ "subsystem": step_row.get("subsystem", ""),
521
+ "command": step_row.get("command", ""),
522
+ "database_name": step_database,
523
+ }
524
+ job_steps[step_row["step_id"]] = step_data
525
+
526
+ # Only add job if it has relevant steps
527
+ if job_steps:
528
+ jobs[job_info["name"]] = job_steps
529
+
530
+ except Exception as step_error:
531
+ logger.warning(
532
+ f"Failed to get steps for job {job_info['name']}: {step_error}"
533
+ )
534
+ continue
535
+
536
+ return jobs
537
+
538
+ def _get_jobs_via_direct_query(
539
+ self, conn: Connection, db_name: str
540
+ ) -> Dict[str, Dict[str, Any]]:
541
+ """
542
+ Original method using direct table access for on-premises SQL Server.
543
+ """
326
544
  jobs_data = conn.execute(
327
545
  f"""
328
546
  SELECT
@@ -345,6 +563,7 @@ class SQLServerSource(SQLAlchemySource):
345
563
  where database_name = '{db_name}'
346
564
  """
347
565
  )
566
+
348
567
  jobs: Dict[str, Dict[str, Any]] = {}
349
568
  for row in jobs_data:
350
569
  step_data = dict(
@@ -357,11 +576,13 @@ class SQLServerSource(SQLAlchemySource):
357
576
  step_name=row["step_name"],
358
577
  subsystem=row["subsystem"],
359
578
  command=row["command"],
579
+ database_name=row["database_name"],
360
580
  )
361
581
  if row["name"] in jobs:
362
582
  jobs[row["name"]][row["step_id"]] = step_data
363
583
  else:
364
584
  jobs[row["name"]] = {row["step_id"]: step_data}
585
+
365
586
  return jobs
366
587
 
367
588
  def loop_jobs(
@@ -371,21 +592,59 @@ class SQLServerSource(SQLAlchemySource):
371
592
  ) -> Iterable[MetadataWorkUnit]:
372
593
  """
373
594
  Loop MS SQL jobs as dataFlow-s.
374
- :return:
595
+ Now supports both managed and on-premises SQL Server.
375
596
  """
376
597
  db_name = self.get_db_name(inspector)
377
- with inspector.engine.connect() as conn:
378
- jobs = self._get_jobs(conn, db_name)
379
- for job_name, job_steps in jobs.items():
380
- job = MSSQLJob(
381
- name=job_name,
382
- env=sql_config.env,
383
- db=db_name,
384
- platform_instance=sql_config.platform_instance,
598
+
599
+ try:
600
+ with inspector.engine.connect() as conn:
601
+ jobs = self._get_jobs(conn, db_name)
602
+
603
+ if not jobs:
604
+ logger.info(f"No jobs found for database: {db_name}")
605
+ return
606
+
607
+ logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
608
+
609
+ for job_name, job_steps in jobs.items():
610
+ try:
611
+ job = MSSQLJob(
612
+ name=job_name,
613
+ env=sql_config.env,
614
+ db=db_name,
615
+ platform_instance=sql_config.platform_instance,
616
+ )
617
+ data_flow = MSSQLDataFlow(entity=job)
618
+ yield from self.construct_flow_workunits(data_flow=data_flow)
619
+ yield from self.loop_job_steps(job, job_steps)
620
+
621
+ except Exception as job_error:
622
+ logger.warning(f"Failed to process job {job_name}: {job_error}")
623
+ self.report.warning(
624
+ message=f"Failed to process job {job_name}",
625
+ title="SQL Server Jobs Extraction",
626
+ context="Error occurred while processing individual job",
627
+ exc=job_error,
628
+ )
629
+ continue
630
+
631
+ except Exception as e:
632
+ error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
633
+ logger.error(error_message)
634
+
635
+ # Provide specific guidance for permission issues
636
+ if "permission" in str(e).lower() or "denied" in str(e).lower():
637
+ permission_guidance = (
638
+ "For managed SQL Server services, ensure the following permissions are granted:\n"
639
+ "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
640
+ "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
641
+ "For on-premises SQL Server, you may also need:\n"
642
+ "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
643
+ "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
385
644
  )
386
- data_flow = MSSQLDataFlow(entity=job)
387
- yield from self.construct_flow_workunits(data_flow=data_flow)
388
- yield from self.loop_job_steps(job, job_steps)
645
+ logger.info(permission_guidance)
646
+
647
+ raise e
389
648
 
390
649
  def loop_job_steps(
391
650
  self, job: MSSQLJob, job_steps: Dict[str, Any]
@@ -405,7 +664,7 @@ class SQLServerSource(SQLAlchemySource):
405
664
  self,
406
665
  inspector: Inspector,
407
666
  schema: str,
408
- sql_config: SQLServerConfig,
667
+ sql_config: SQLServerConfig, # type: ignore
409
668
  ) -> Iterable[MetadataWorkUnit]:
410
669
  """
411
670
  Loop schema data for get stored procedures as dataJob-s.
@@ -714,25 +973,29 @@ class SQLServerSource(SQLAlchemySource):
714
973
  url = self.config.get_sql_alchemy_url()
715
974
  logger.debug(f"sql_alchemy_url={url}")
716
975
  engine = create_engine(url, **self.config.options)
717
- with engine.connect() as conn:
718
- if self.config.database and self.config.database != "":
719
- inspector = inspect(conn)
720
- yield inspector
721
- else:
976
+
977
+ if (
978
+ self.config.database
979
+ and self.config.database != ""
980
+ or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
981
+ ):
982
+ inspector = inspect(engine)
983
+ yield inspector
984
+ else:
985
+ with engine.begin() as conn:
722
986
  databases = conn.execute(
723
987
  "SELECT name FROM master.sys.databases WHERE name NOT IN \
724
988
  ('master', 'model', 'msdb', 'tempdb', 'Resource', \
725
989
  'distribution' , 'reportserver', 'reportservertempdb'); "
726
- )
727
- for db in databases:
728
- if self.config.database_pattern.allowed(db["name"]):
729
- url = self.config.get_sql_alchemy_url(current_db=db["name"])
730
- with create_engine(
731
- url, **self.config.options
732
- ).connect() as conn:
733
- inspector = inspect(conn)
734
- self.current_database = db["name"]
735
- yield inspector
990
+ ).fetchall()
991
+
992
+ for db in databases:
993
+ if self.config.database_pattern.allowed(db["name"]):
994
+ url = self.config.get_sql_alchemy_url(current_db=db["name"])
995
+ engine = create_engine(url, **self.config.options)
996
+ inspector = inspect(engine)
997
+ self.current_database = db["name"]
998
+ yield inspector
736
999
 
737
1000
  def get_identifier(
738
1001
  self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -763,13 +1026,22 @@ class SQLServerSource(SQLAlchemySource):
763
1026
  yield from auto_workunit(
764
1027
  generate_procedure_lineage(
765
1028
  schema_resolver=self.get_schema_resolver(),
766
- procedure=procedure,
1029
+ procedure=procedure.to_base_procedure(),
767
1030
  procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
768
1031
  is_temp_table=self.is_temp_table,
1032
+ default_db=procedure.db,
1033
+ default_schema=procedure.schema,
769
1034
  )
770
1035
  )
771
1036
 
772
1037
  def is_temp_table(self, name: str) -> bool:
1038
+ if any(
1039
+ re.match(pattern, name, flags=re.IGNORECASE)
1040
+ for pattern in self.config.temporary_tables_pattern
1041
+ ):
1042
+ logger.debug(f"temp table matched by pattern {name}")
1043
+ return True
1044
+
773
1045
  try:
774
1046
  parts = name.split(".")
775
1047
  table_name = parts[-1]
@@ -803,3 +1075,45 @@ class SQLServerSource(SQLAlchemySource):
803
1075
  if self.config.convert_urns_to_lowercase
804
1076
  else table_ref_str
805
1077
  )
1078
+
1079
+ def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
1080
+ for schema in super().get_allowed_schemas(inspector, db_name):
1081
+ if self.config.quote_schemas:
1082
+ yield quoted_name(schema, True)
1083
+ else:
1084
+ yield schema
1085
+
1086
+ def get_db_name(self, inspector: Inspector) -> str:
1087
+ engine = inspector.engine
1088
+
1089
+ try:
1090
+ if (
1091
+ engine
1092
+ and hasattr(engine, "url")
1093
+ and hasattr(engine.url, "database")
1094
+ and engine.url.database
1095
+ ):
1096
+ return str(engine.url.database).strip('"')
1097
+
1098
+ if (
1099
+ engine
1100
+ and hasattr(engine, "url")
1101
+ and hasattr(engine.url, "query")
1102
+ and "odbc_connect" in engine.url.query
1103
+ ):
1104
+ # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
1105
+ database = re.search(
1106
+ r"DATABASE=([^;]*);",
1107
+ urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
1108
+ flags=re.IGNORECASE,
1109
+ )
1110
+
1111
+ if database and database.group(1):
1112
+ return database.group(1)
1113
+
1114
+ return ""
1115
+
1116
+ except Exception as e:
1117
+ raise RuntimeError(
1118
+ "Unable to get database name from Sqlalchemy inspector"
1119
+ ) from e