acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
27
27
  from sqlalchemy.sql import sqltypes as types
28
28
  from sqlalchemy.types import TypeDecorator, TypeEngine
29
29
 
30
+ from datahub.configuration.common import AllowDenyPattern
30
31
  from datahub.emitter.mce_builder import (
31
32
  make_data_platform_urn,
32
33
  make_dataplatform_instance_urn,
@@ -45,6 +46,7 @@ from datahub.ingestion.api.source import (
45
46
  TestableSource,
46
47
  TestConnectionReport,
47
48
  )
49
+ from datahub.ingestion.api.source_protocols import MetadataWorkUnitIterable
48
50
  from datahub.ingestion.api.workunit import MetadataWorkUnit
49
51
  from datahub.ingestion.glossary.classification_mixin import (
50
52
  SAMPLE_SIZE_MULTIPLIER,
@@ -54,6 +56,7 @@ from datahub.ingestion.source.common.data_reader import DataReader
54
56
  from datahub.ingestion.source.common.subtypes import (
55
57
  DatasetContainerSubTypes,
56
58
  DatasetSubTypes,
59
+ SourceCapabilityModifier,
57
60
  )
58
61
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
59
62
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -70,39 +73,47 @@ from datahub.ingestion.source.sql.sql_utils import (
70
73
  from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
71
74
  SqlAlchemyTableDataReader,
72
75
  )
76
+ from datahub.ingestion.source.sql.stored_procedures.base import (
77
+ BaseProcedure,
78
+ generate_procedure_container_workunits,
79
+ generate_procedure_workunits,
80
+ )
73
81
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
74
82
  StaleEntityRemovalHandler,
75
83
  )
76
84
  from datahub.ingestion.source.state.stateful_ingestion_base import (
77
85
  StatefulIngestionSourceBase,
78
86
  )
79
- from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
80
- from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
81
- from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
82
- from datahub.metadata.com.linkedin.pegasus2avro.schema import (
87
+ from datahub.metadata.schema_classes import (
83
88
  ArrayTypeClass,
84
89
  BooleanTypeClass,
85
90
  BytesTypeClass,
91
+ DataPlatformInstanceClass,
92
+ DatasetLineageTypeClass,
93
+ DatasetPropertiesClass,
94
+ DatasetSnapshotClass,
86
95
  DateTypeClass,
87
96
  EnumTypeClass,
88
- ForeignKeyConstraint,
89
- MySqlDDL,
97
+ FineGrainedLineageClass,
98
+ FineGrainedLineageDownstreamTypeClass,
99
+ FineGrainedLineageUpstreamTypeClass,
100
+ ForeignKeyConstraintClass,
101
+ GlobalTagsClass,
102
+ MetadataChangeEventClass,
103
+ MySqlDDLClass,
90
104
  NullTypeClass,
91
105
  NumberTypeClass,
92
106
  RecordTypeClass,
93
- SchemaField,
94
- SchemaFieldDataType,
95
- SchemaMetadata,
107
+ SchemaFieldClass,
108
+ SchemaFieldDataTypeClass,
109
+ SchemaMetadataClass,
110
+ StatusClass,
96
111
  StringTypeClass,
97
- TimeTypeClass,
98
- )
99
- from datahub.metadata.schema_classes import (
100
- DataPlatformInstanceClass,
101
- DatasetLineageTypeClass,
102
- DatasetPropertiesClass,
103
- GlobalTagsClass,
104
112
  SubTypesClass,
105
113
  TagAssociationClass,
114
+ TimeTypeClass,
115
+ UpstreamClass,
116
+ UpstreamLineageClass,
106
117
  ViewPropertiesClass,
107
118
  )
108
119
  from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -112,6 +123,7 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
112
123
  from datahub.utilities.sqlalchemy_type_converter import (
113
124
  get_native_data_type_for_sqlalchemy_type,
114
125
  )
126
+ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
115
127
 
116
128
  if TYPE_CHECKING:
117
129
  from datahub.ingestion.source.ge_data_profiler import (
@@ -198,7 +210,7 @@ def make_sqlalchemy_type(name: str) -> Type[TypeEngine]:
198
210
 
199
211
  def get_column_type(
200
212
  sql_report: SQLSourceReport, dataset_name: str, column_type: Any
201
- ) -> SchemaFieldDataType:
213
+ ) -> SchemaFieldDataTypeClass:
202
214
  """
203
215
  Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
204
216
  """
@@ -223,7 +235,7 @@ def get_column_type(
223
235
  )
224
236
  TypeClass = NullTypeClass
225
237
 
226
- return SchemaFieldDataType(type=TypeClass())
238
+ return SchemaFieldDataTypeClass(type=TypeClass())
227
239
 
228
240
 
229
241
  def get_schema_metadata(
@@ -232,10 +244,10 @@ def get_schema_metadata(
232
244
  platform: str,
233
245
  columns: List[dict],
234
246
  pk_constraints: Optional[dict] = None,
235
- foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
236
- canonical_schema: Optional[List[SchemaField]] = None,
247
+ foreign_keys: Optional[List[ForeignKeyConstraintClass]] = None,
248
+ canonical_schema: Optional[List[SchemaFieldClass]] = None,
237
249
  simplify_nested_field_paths: bool = False,
238
- ) -> SchemaMetadata:
250
+ ) -> SchemaMetadataClass:
239
251
  if (
240
252
  simplify_nested_field_paths
241
253
  and canonical_schema is not None
@@ -243,12 +255,12 @@ def get_schema_metadata(
243
255
  ):
244
256
  canonical_schema = downgrade_schema_from_v2(canonical_schema)
245
257
 
246
- schema_metadata = SchemaMetadata(
258
+ schema_metadata = SchemaMetadataClass(
247
259
  schemaName=dataset_name,
248
260
  platform=make_data_platform_urn(platform),
249
261
  version=0,
250
262
  hash="",
251
- platformSchema=MySqlDDL(tableSchema=""),
263
+ platformSchema=MySqlDDLClass(tableSchema=""),
252
264
  fields=canonical_schema or [],
253
265
  )
254
266
  if foreign_keys is not None and foreign_keys != []:
@@ -287,6 +299,10 @@ class ProfileMetadata:
287
299
  SourceCapability.CONTAINERS,
288
300
  "Enabled by default",
289
301
  supported=True,
302
+ subtype_modifier=[
303
+ SourceCapabilityModifier.DATABASE,
304
+ SourceCapabilityModifier.SCHEMA,
305
+ ],
290
306
  )
291
307
  @capability(
292
308
  SourceCapability.DESCRIPTIONS,
@@ -298,6 +314,20 @@ class ProfileMetadata:
298
314
  "Enabled by default",
299
315
  supported=True,
300
316
  )
317
+ @capability(
318
+ SourceCapability.LINEAGE_COARSE,
319
+ "Enabled by default to get lineage for views via `include_view_lineage`",
320
+ subtype_modifier=[SourceCapabilityModifier.VIEW],
321
+ )
322
+ @capability(
323
+ SourceCapability.LINEAGE_FINE,
324
+ "Enabled by default to get lineage for views via `include_view_column_lineage`",
325
+ subtype_modifier=[SourceCapabilityModifier.VIEW],
326
+ )
327
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
328
+ @capability(
329
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
330
+ )
301
331
  class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
302
332
  """A Base class for all SQL Sources that use SQLAlchemy to extend"""
303
333
 
@@ -508,6 +538,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
508
538
  if self.config.include_views:
509
539
  yield from self.loop_views(inspector, schema, self.config)
510
540
 
541
+ if getattr(self.config, "include_stored_procedures", False):
542
+ try:
543
+ yield from self.loop_stored_procedures(inspector, schema, self.config)
544
+ except NotImplementedError as e:
545
+ self.report.warning(
546
+ title="Stored procedures not supported",
547
+ message="The current SQL dialect does not support stored procedures.",
548
+ context=f"{database}.{schema}",
549
+ exc=e,
550
+ )
551
+ except Exception as e:
552
+ self.report.failure(
553
+ title="Failed to list stored procedures for schema",
554
+ message="An error occurred while listing procedures for the schema.",
555
+ context=f"{database}.{schema}",
556
+ exc=e,
557
+ )
558
+
511
559
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
512
560
  return [
513
561
  *super().get_workunit_processors(),
@@ -531,19 +579,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
531
579
  self._add_default_options(sql_config)
532
580
 
533
581
  for inspector in self.get_inspectors():
534
- profiler = None
535
- profile_requests: List["GEProfilerRequest"] = []
536
- if sql_config.is_profiling_enabled():
537
- profiler = self.get_profiler_instance(inspector)
538
- try:
539
- self.add_profile_metadata(inspector)
540
- except Exception as e:
541
- self.warn(
542
- logger,
543
- "profile_metadata",
544
- f"Failed to get enrichment data for profile {e}",
545
- )
546
-
547
582
  db_name = self.get_db_name(inspector)
548
583
  yield from self.get_database_level_workunits(
549
584
  inspector=inspector,
@@ -559,17 +594,41 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
559
594
  database=db_name,
560
595
  )
561
596
 
597
+ # Generate workunit for aggregated SQL parsing results
598
+ yield from self._generate_aggregator_workunits()
599
+
600
+ def is_profiling_enabled_internal(self) -> bool:
601
+ return self.config.is_profiling_enabled()
602
+
603
+ def get_profiling_internal(
604
+ self,
605
+ ) -> MetadataWorkUnitIterable:
606
+ sql_config = self.config
607
+ for inspector in self.get_inspectors():
608
+ profiler = None
609
+ profile_requests: List["GEProfilerRequest"] = []
610
+ profiler = self.get_profiler_instance(inspector)
611
+ try:
612
+ self.add_profile_metadata(inspector)
613
+ except Exception as e:
614
+ self.warn(
615
+ logger,
616
+ "profile_metadata",
617
+ f"Failed to get enrichment data for profile {e}",
618
+ )
619
+ db_name = self.get_db_name(inspector)
620
+ for schema in self.get_allowed_schemas(inspector, db_name):
562
621
  if profiler:
563
622
  profile_requests += list(
564
623
  self.loop_profiler_requests(inspector, schema, sql_config)
565
624
  )
566
-
567
625
  if profiler and profile_requests:
568
626
  yield from self.loop_profiler(
569
627
  profile_requests, profiler, platform=self.platform
570
628
  )
571
629
 
572
- # Generate workunit for aggregated SQL parsing results
630
+ def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
631
+ """Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
573
632
  for mcp in self.aggregator.gen_metadata():
574
633
  yield mcp.as_workunit()
575
634
 
@@ -590,7 +649,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
590
649
  schema: str,
591
650
  fk_dict: Dict[str, str],
592
651
  inspector: Inspector,
593
- ) -> ForeignKeyConstraint:
652
+ ) -> ForeignKeyConstraintClass:
594
653
  referred_schema: Optional[str] = fk_dict.get("referred_schema")
595
654
 
596
655
  if not referred_schema:
@@ -617,7 +676,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
617
676
  for f in fk_dict["referred_columns"]
618
677
  ]
619
678
 
620
- return ForeignKeyConstraint(
679
+ return ForeignKeyConstraintClass(
621
680
  fk_dict["name"], foreign_fields, source_fields, foreign_dataset
622
681
  )
623
682
 
@@ -714,7 +773,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
714
773
  self.config.platform_instance,
715
774
  self.config.env,
716
775
  )
717
- dataset_snapshot = DatasetSnapshot(
776
+ dataset_snapshot = DatasetSnapshotClass(
718
777
  urn=dataset_urn,
719
778
  aspects=[StatusClass(removed=False)],
720
779
  )
@@ -742,6 +801,30 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
742
801
  tags=extra_tags,
743
802
  partition_keys=partitions,
744
803
  )
804
+
805
+ if self.config.include_table_location_lineage and location_urn:
806
+ self.aggregator.add_known_lineage_mapping(
807
+ upstream_urn=location_urn,
808
+ downstream_urn=dataset_snapshot.urn,
809
+ lineage_type=DatasetLineageTypeClass.COPY,
810
+ )
811
+ external_upstream_table = UpstreamClass(
812
+ dataset=location_urn,
813
+ type=DatasetLineageTypeClass.COPY,
814
+ )
815
+
816
+ yield MetadataChangeProposalWrapper(
817
+ entityUrn=dataset_snapshot.urn,
818
+ aspect=UpstreamLineageClass(
819
+ upstreams=[external_upstream_table],
820
+ fineGrainedLineages=self.get_fine_grained_lineages(
821
+ dataset_urn=dataset_snapshot.urn,
822
+ upstream_dataset_urn=location_urn,
823
+ schema_fields=schema_fields,
824
+ ),
825
+ ),
826
+ ).as_workunit()
827
+
745
828
  schema_metadata = get_schema_metadata(
746
829
  self.report,
747
830
  dataset_name,
@@ -762,7 +845,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
762
845
  yield from self.add_table_to_schema_container(
763
846
  dataset_urn=dataset_urn, db_name=db_name, schema=schema
764
847
  )
765
- mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
848
+ mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
766
849
  yield SqlWorkUnit(id=dataset_name, mce=mce)
767
850
  dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
768
851
  if dpi_aspect:
@@ -797,7 +880,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
797
880
  schema: str,
798
881
  table: str,
799
882
  data_reader: Optional[DataReader],
800
- schema_metadata: SchemaMetadata,
883
+ schema_metadata: SchemaMetadataClass,
801
884
  ) -> None:
802
885
  try:
803
886
  if (
@@ -908,7 +991,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
908
991
 
909
992
  def _get_foreign_keys(
910
993
  self, dataset_urn: str, inspector: Inspector, schema: str, table: str
911
- ) -> List[ForeignKeyConstraint]:
994
+ ) -> List[ForeignKeyConstraintClass]:
912
995
  try:
913
996
  foreign_keys = [
914
997
  self.get_foreign_key_metadata(dataset_urn, schema, fk_rec, inspector)
@@ -922,6 +1005,42 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
922
1005
  foreign_keys = []
923
1006
  return foreign_keys
924
1007
 
1008
+ def get_fine_grained_lineages(
1009
+ self,
1010
+ dataset_urn: str,
1011
+ upstream_dataset_urn: str,
1012
+ schema_fields: List[SchemaFieldClass],
1013
+ ) -> Optional[List[FineGrainedLineageClass]]:
1014
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
1015
+
1016
+ for schema_field in schema_fields:
1017
+ try:
1018
+ field_path_v1 = get_simple_field_path_from_v2_field_path(
1019
+ schema_field.fieldPath
1020
+ )
1021
+ fine_grained_lineages.append(
1022
+ FineGrainedLineageClass(
1023
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
1024
+ downstreams=[make_schema_field_urn(dataset_urn, field_path_v1)],
1025
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
1026
+ upstreams=[
1027
+ make_schema_field_urn(
1028
+ upstream_dataset_urn,
1029
+ get_simple_field_path_from_v2_field_path(
1030
+ schema_field.fieldPath
1031
+ ),
1032
+ )
1033
+ ],
1034
+ )
1035
+ )
1036
+ except Exception as e:
1037
+ logger.warning(
1038
+ f"Error processing field path for {dataset_urn}: {str(e)}"
1039
+ )
1040
+ continue
1041
+
1042
+ return fine_grained_lineages if fine_grained_lineages else None
1043
+
925
1044
  def get_schema_fields(
926
1045
  self,
927
1046
  dataset_name: str,
@@ -930,7 +1049,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
930
1049
  pk_constraints: Optional[dict] = None,
931
1050
  partition_keys: Optional[List[str]] = None,
932
1051
  tags: Optional[Dict[str, List[str]]] = None,
933
- ) -> List[SchemaField]:
1052
+ ) -> List[SchemaFieldClass]:
934
1053
  canonical_schema = []
935
1054
  for column in columns:
936
1055
  column_tags: Optional[List[str]] = None
@@ -955,14 +1074,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
955
1074
  pk_constraints: Optional[dict] = None,
956
1075
  partition_keys: Optional[List[str]] = None,
957
1076
  tags: Optional[List[str]] = None,
958
- ) -> List[SchemaField]:
1077
+ ) -> List[SchemaFieldClass]:
959
1078
  gtc: Optional[GlobalTagsClass] = None
960
1079
  if tags:
961
1080
  tags_str = [make_tag_urn(t) for t in tags]
962
1081
  tags_tac = [TagAssociationClass(t) for t in tags_str]
963
1082
  gtc = GlobalTagsClass(tags_tac)
964
1083
  full_type = column.get("full_type")
965
- field = SchemaField(
1084
+ field = SchemaFieldClass(
966
1085
  fieldPath=column["name"],
967
1086
  type=get_column_type(self.report, dataset_name, column["type"]),
968
1087
  nativeDataType=(
@@ -1092,7 +1211,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
1092
1211
  default_schema=default_schema,
1093
1212
  )
1094
1213
 
1095
- dataset_snapshot = DatasetSnapshot(
1214
+ dataset_snapshot = DatasetSnapshotClass(
1096
1215
  urn=dataset_urn,
1097
1216
  aspects=[StatusClass(removed=False)],
1098
1217
  )
@@ -1111,7 +1230,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
1111
1230
  dataset_snapshot.aspects.append(dataset_properties)
1112
1231
  if schema_metadata:
1113
1232
  dataset_snapshot.aspects.append(schema_metadata)
1114
- mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
1233
+ mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
1115
1234
  yield SqlWorkUnit(id=dataset_name, mce=mce)
1116
1235
  dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
1117
1236
  if dpi_aspect:
@@ -1350,3 +1469,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
1350
1469
 
1351
1470
  def get_report(self):
1352
1471
  return self.report
1472
+
1473
+ def loop_stored_procedures(
1474
+ self,
1475
+ inspector: Inspector,
1476
+ schema: str,
1477
+ config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
1478
+ ) -> Iterable[MetadataWorkUnit]:
1479
+ """
1480
+ Loop schema data for get stored procedures as dataJob-s.
1481
+ """
1482
+ db_name = self.get_db_name(inspector)
1483
+
1484
+ procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
1485
+ if procedures:
1486
+ yield from self._process_procedures(procedures, db_name, schema)
1487
+
1488
+ def fetch_procedures_for_schema(
1489
+ self, inspector: Inspector, schema: str, db_name: str
1490
+ ) -> List[BaseProcedure]:
1491
+ try:
1492
+ raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
1493
+ inspector, schema, db_name
1494
+ )
1495
+ procedures: List[BaseProcedure] = []
1496
+ for procedure in raw_procedures:
1497
+ procedure_qualified_name = self.get_identifier(
1498
+ schema=schema,
1499
+ entity=procedure.name,
1500
+ inspector=inspector,
1501
+ )
1502
+
1503
+ procedure_pattern = getattr(
1504
+ self.config, "procedure_pattern", AllowDenyPattern.allow_all()
1505
+ )
1506
+ if not procedure_pattern.allowed(procedure_qualified_name):
1507
+ self.report.report_dropped(procedure_qualified_name)
1508
+ else:
1509
+ procedures.append(procedure)
1510
+ return procedures
1511
+ except NotImplementedError:
1512
+ raise
1513
+ except Exception as e:
1514
+ self.report.warning(
1515
+ title="Failed to get procedures for schema",
1516
+ message="An error occurred while fetching procedures for the schema.",
1517
+ context=f"{db_name}.{schema}",
1518
+ exc=e,
1519
+ )
1520
+ return []
1521
+
1522
+ def get_procedures_for_schema(
1523
+ self, inspector: Inspector, schema: str, db_name: str
1524
+ ) -> List[BaseProcedure]:
1525
+ raise NotImplementedError(
1526
+ "Subclasses must implement the 'get_procedures_for_schema' method."
1527
+ )
1528
+
1529
+ def _process_procedures(
1530
+ self,
1531
+ procedures: List[BaseProcedure],
1532
+ db_name: str,
1533
+ schema: str,
1534
+ ) -> Iterable[MetadataWorkUnit]:
1535
+ if procedures:
1536
+ yield from generate_procedure_container_workunits(
1537
+ database_key=gen_database_key(
1538
+ database=db_name,
1539
+ platform=self.platform,
1540
+ platform_instance=self.config.platform_instance,
1541
+ env=self.config.env,
1542
+ ),
1543
+ schema_key=gen_schema_key(
1544
+ db_name=db_name,
1545
+ schema=schema,
1546
+ platform=self.platform,
1547
+ platform_instance=self.config.platform_instance,
1548
+ env=self.config.env,
1549
+ ),
1550
+ )
1551
+ for procedure in procedures:
1552
+ yield from self._process_procedure(procedure, schema, db_name)
1553
+
1554
+ def _process_procedure(
1555
+ self,
1556
+ procedure: BaseProcedure,
1557
+ schema: str,
1558
+ db_name: str,
1559
+ ) -> Iterable[MetadataWorkUnit]:
1560
+ try:
1561
+ yield from generate_procedure_workunits(
1562
+ procedure=procedure,
1563
+ database_key=gen_database_key(
1564
+ database=db_name,
1565
+ platform=self.platform,
1566
+ platform_instance=self.config.platform_instance,
1567
+ env=self.config.env,
1568
+ ),
1569
+ schema_key=gen_schema_key(
1570
+ db_name=db_name,
1571
+ schema=schema,
1572
+ platform=self.platform,
1573
+ platform_instance=self.config.platform_instance,
1574
+ env=self.config.env,
1575
+ ),
1576
+ schema_resolver=self.get_schema_resolver(),
1577
+ )
1578
+ except Exception as e:
1579
+ self.report.warning(
1580
+ title="Failed to emit stored procedure",
1581
+ message="An error occurred while emitting stored procedure",
1582
+ context=procedure.name,
1583
+ exc=e,
1584
+ )
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
4
4
 
5
5
  import pydantic
6
6
  from pydantic import Field
7
- from sqlalchemy.engine import URL
8
7
 
9
8
  from datahub.configuration.common import AllowDenyPattern, ConfigModel
10
9
  from datahub.configuration.source_common import (
@@ -20,6 +19,7 @@ from datahub.ingestion.glossary.classification_mixin import (
20
19
  ClassificationSourceConfigMixin,
21
20
  )
22
21
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
22
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
23
23
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
24
24
  StatefulStaleMetadataRemovalConfig,
25
25
  )
@@ -184,36 +184,3 @@ class SQLAlchemyConnectionConfig(ConfigModel):
184
184
 
185
185
  class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig):
186
186
  pass
187
-
188
-
189
- def make_sqlalchemy_uri(
190
- scheme: str,
191
- username: Optional[str],
192
- password: Optional[str],
193
- at: Optional[str],
194
- db: Optional[str],
195
- uri_opts: Optional[Dict[str, Any]] = None,
196
- ) -> str:
197
- host: Optional[str] = None
198
- port: Optional[int] = None
199
- if at:
200
- try:
201
- host, port_str = at.rsplit(":", 1)
202
- port = int(port_str)
203
- except ValueError:
204
- host = at
205
- port = None
206
- if uri_opts:
207
- uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
208
-
209
- return str(
210
- URL.create(
211
- drivername=scheme,
212
- username=username,
213
- password=password,
214
- host=host,
215
- port=port,
216
- database=db,
217
- query=uri_opts or {},
218
- )
219
- )
@@ -57,10 +57,11 @@ class GenericProfiler:
57
57
  platform: Optional[str] = None,
58
58
  profiler_args: Optional[Dict] = None,
59
59
  ) -> Iterable[MetadataWorkUnit]:
60
+ # We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
60
61
  ge_profile_requests: List[GEProfilerRequest] = [
61
62
  cast(GEProfilerRequest, request)
62
63
  for request in requests
63
- if not request.profile_table_level_only
64
+ if not request.profile_table_level_only or request.table.rows_count == 0
64
65
  ]
65
66
  table_level_profile_requests: List[TableProfilerRequest] = [
66
67
  request for request in requests if request.profile_table_level_only
@@ -284,6 +284,8 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
284
284
  "INTEGER": NumberType,
285
285
  "BIGINT": NumberType,
286
286
  "SMALLINT": NumberType,
287
+ "TINYINT": NumberType,
288
+ "BYTEINT": NumberType,
287
289
  "FLOAT": NumberType,
288
290
  "FLOAT4": NumberType,
289
291
  "FLOAT8": NumberType,
@@ -291,6 +293,7 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
291
293
  "DOUBLE PRECISION": NumberType,
292
294
  "REAL": NumberType,
293
295
  "VARCHAR": StringType,
296
+ "CHARACTER VARYING": StringType,
294
297
  "CHAR": StringType,
295
298
  "CHARACTER": StringType,
296
299
  "STRING": StringType,
@@ -313,8 +316,8 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
313
316
 
314
317
 
315
318
  def resolve_snowflake_modified_type(type_string: str) -> Any:
316
- # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
317
- match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
319
+ # Match types with precision and scale, e.g., 'DECIMAL(38,0)' or TIME(3)
320
+ match = re.match(r"([a-z A-Z_]+)\(\d+(,(\s+)?\d+)?\)", type_string)
318
321
  if match:
319
322
  modified_type_base = match.group(1) # Extract the base type
320
323
  return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
@@ -456,6 +459,25 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
456
459
  "uuid": StringType,
457
460
  }
458
461
 
462
+ # Neo4j property types mapping
463
+ # https://neo4j.com/docs/cypher-manual/current/values-and-types/property-structural-constructed/
464
+ NEO4J_TYPES_MAP: Dict[str, Any] = {
465
+ "boolean": BooleanType,
466
+ "date": DateType,
467
+ "duration": TimeType, # Neo4j duration represents a temporal amount
468
+ "float": NumberType,
469
+ "integer": NumberType,
470
+ "list": ArrayType,
471
+ "local_date_time": TimeType,
472
+ "local_time": TimeType,
473
+ "point": StringType, # Neo4j point - spatial coordinate, represented as string
474
+ "string": StringType,
475
+ "zoned_date_time": TimeType,
476
+ "zoned_time": TimeType,
477
+ "node": StringType, # Neo4j object type
478
+ "relationship": StringType, # Neo4j object type
479
+ }
480
+
459
481
 
460
482
  _merged_mapping = {
461
483
  "boolean": BooleanType,
@@ -475,6 +497,7 @@ _merged_mapping = {
475
497
  **TRINO_SQL_TYPES_MAP,
476
498
  **ATHENA_SQL_TYPES_MAP,
477
499
  **VERTICA_SQL_TYPES_MAP,
500
+ **NEO4J_TYPES_MAP,
478
501
  }
479
502
 
480
503
 
@@ -484,6 +507,8 @@ def resolve_sql_type(
484
507
  ) -> Optional[DATAHUB_FIELD_TYPE]:
485
508
  # In theory, we should use the platform-specific mapping where available.
486
509
  # However, the types don't ever conflict, so the merged mapping is fine.
510
+ # Wrong assumption - there ARE conflicts as the test_type_conflicts_across_platforms in test_sql_types.py shows.
511
+ # TODO: revisit this and make platform-specific mappings work.
487
512
  TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
488
513
  _merged_mapping.get(column_type) if column_type else None
489
514
  )