acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/trino.py

@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -36,6 +37,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -128,24 +130,47 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
         if catalog_name is None:
             raise exc.NoSuchTableError("catalog is required in connection")
         connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-        if connector_name is None:
-            return {}
-        if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+        if (
+            connector_name is not None
+            and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+        ):
             properties_table = self._get_full_table(f"{table_name}$properties", schema)
             query = f"SELECT * FROM {properties_table}"
-            row = connection.execute(sql.text(query)).fetchone()
+            rows = connection.execute(sql.text(query)).fetchall()
 
             # Generate properties dictionary.
             properties = {}
-            if row:
+
+            if len(rows) == 0:
+                # No properties found, return empty dictionary
+                return {}
+
+            # Check if using the old format (key, value columns)
+            if (
+                connector_name == "iceberg"
+                and len(rows[0]) == 2
+                and "key" in rows[0]
+                and "value" in rows[0]
+            ):
+                # https://trino.io/docs/current/connector/iceberg.html#properties-table
+                for row in rows:
+                    if row["value"] is not None:
+                        properties[row["key"]] = row["value"]
+                return {"text": properties.get("comment"), "properties": properties}
+            elif connector_name == "hive" and len(rows[0]) > 1 and len(rows) == 1:
+                # https://trino.io/docs/current/connector/hive.html#properties-table
+                row = rows[0]
                 for col_name, col_value in row.items():
                     if col_value is not None:
                         properties[col_name] = col_value
+                return {"text": properties.get("comment"), "properties": properties}
 
-            return {"text": properties.get("comment"), "properties": properties}
-        else:
-            return self.get_table_comment_default(connection, table_name, schema)
-    except Exception:
+        # If we can't get the properties we still fallback to the default
+        return self.get_table_comment_default(connection, table_name, schema)
+    except Exception as e:
+        logging.warning(
+            f"Failed to get table comment for {table_name} in {schema}: {e}"
+        )
        return {}
 
 
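The branch added above handles the two shapes in which Trino returns `$properties` metadata, per the linked connector docs. A small illustrative sketch of the difference (invented values, plain dicts standing in for result rows):

    # Iceberg: SELECT * FROM "orders$properties" returns many (key, value) rows,
    # which get folded into a single properties dict.
    iceberg_rows = [
        {"key": "comment", "value": "Orders fact table"},
        {"key": "write.format.default", "value": "PARQUET"},
    ]
    properties = {r["key"]: r["value"] for r in iceberg_rows if r["value"] is not None}

    # Hive: the same query returns exactly one wide row whose columns are the properties.
    hive_rows = [{"comment": "Orders fact table", "numFiles": "12", "totalSize": "73781"}]
    properties = {k: v for k, v in hive_rows[0].items() if v is not None}

    # Either way, the table comment surfaces as properties.get("comment").
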
@@ -198,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
@@ -226,6 +251,14 @@ class TrinoConfig(BasicSQLAlchemyConfig):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Extract table-level lineage",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class TrinoSource(SQLAlchemySource):
     """
 

datahub/ingestion/source/sql/two_tier_sql_source.py

@@ -7,19 +7,19 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_common import SQLAlchemySource, logger
 from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
-    make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 
 
 class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
         # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 

datahub/ingestion/source/sql/vertica.py

@@ -4,7 +4,8 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import pydantic
-from pydantic.class_validators import validator
+import pytest
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +46,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -52,6 +56,8 @@ from datahub.utilities import config_clean
 
 if TYPE_CHECKING:
     from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
+
+pytestmark = pytest.mark.integration_batch_4
 logger: logging.Logger = logging.getLogger(__name__)
 
 
@@ -113,10 +119,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +503,8 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()
 
         if self.config.domain:
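
A note on the simplified MetadataChangeProposalWrapper call above: the wrapper infers `entityType` from the URN and `aspectName` from the aspect class, and `changeType` defaults to UPSERT, which is why the removed keyword arguments were redundant. A minimal sketch with an invented URN:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # entityType, aspectName, and changeType (UPSERT) are all inferred.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:vertica,public.orders,PROD)",
        aspect=StatusClass(removed=False),
    )
    workunit = mcp.as_workunit()
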
datahub/ingestion/source/sql_queries.py

@@ -2,21 +2,22 @@ import json
 import logging
 import os
 from dataclasses import dataclass
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import partial
-from typing import Iterable, List, Optional, Set
+from typing import ClassVar, Iterable, List, Optional, Union
 
-from pydantic import Field
+from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
 from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
-    make_user_urn,
 )
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -25,6 +26,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -35,13 +40,21 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
+from datahub.metadata.urns import CorpUserUrn, DatasetUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlAggregatorReport,
+    SqlParsingAggregator,
+)
 
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -53,45 +66,34 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
     )
-    default_dialect: Optional[str] = Field(
+    override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
+@dataclass
 class SqlQueriesSourceReport(SourceReport):
-    num_queries_parsed: int = 0
-    num_table_parse_failures: int = 0
-    num_column_parse_failures: int = 0
-
-    def compute_stats(self) -> None:
-        super().compute_stats()
-        self.table_failure_rate = (
-            f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
-        self.column_failure_rate = (
-            f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
+    num_entries_processed: int = 0
+    num_entries_failed: int = 0
+    num_queries_aggregator_failures: int = 0
 
+    sql_aggregator: Optional[SqlAggregatorReport] = None
 
-@platform_name("SQL Queries")
+
+@platform_name("SQL Queries", id="sql-queries")
 @config_class(SqlQueriesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries")
@@ -107,15 +109,25 @@ class SqlQueriesSource(Source):
     - user (optional): string - The user who ran the query.
         This user value will be directly converted into a DataHub user urn.
     - operation_type (optional): string - Platform-specific operation type, used if the operation type can't be parsed.
+    - session_id (optional): string - Session identifier for temporary table resolution across queries.
     - downstream_tables (optional): string[] - Fallback list of tables that the query writes to,
         used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
         used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
-    urns: Optional[Set[str]]
-    schema_resolver: SchemaResolver
-    builder: SqlParsingBuilder
+    schema_resolver: Optional[SchemaResolver]
+    aggregator: SqlParsingAggregator
 
     def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         if not ctx.graph:
128
140
  self.config = config
129
141
  self.report = SqlQueriesSourceReport()
130
142
 
131
- self.builder = SqlParsingBuilder(usage_config=self.config.usage)
132
-
133
143
  if self.config.use_schema_resolver:
144
+ # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
145
+ # for the given platform, platform instance, and env. Instead this should be configurable:
146
+ # bulk initialization vs lazy on-demand schema fetching.
134
147
  self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
135
148
  platform=self.config.platform,
136
149
  platform_instance=self.config.platform_instance,
137
150
  env=self.config.env,
138
151
  )
139
- self.urns = self.schema_resolver.get_urns()
140
152
  else:
141
- self.schema_resolver = self.graph._make_schema_resolver(
142
- platform=self.config.platform,
143
- platform_instance=self.config.platform_instance,
144
- env=self.config.env,
145
- )
146
- self.urns = None
153
+ self.schema_resolver = None
154
+
155
+ self.aggregator = SqlParsingAggregator(
156
+ platform=self.config.platform,
157
+ platform_instance=self.config.platform_instance,
158
+ env=self.config.env,
159
+ schema_resolver=self.schema_resolver,
160
+ eager_graph_load=False,
161
+ generate_lineage=True, # TODO: make this configurable
162
+ generate_queries=True, # TODO: make this configurable
163
+ generate_query_subject_fields=True, # TODO: make this configurable
164
+ generate_query_usage_statistics=True, # This enables publishing SELECT query entities, otherwise only mutation queries are published
165
+ generate_usage_statistics=True,
166
+ generate_operations=True, # TODO: make this configurable
167
+ usage_config=self.config.usage,
168
+ is_temp_table=None,
169
+ is_allowed_table=None,
170
+ format_queries=False,
171
+ )
172
+ self.report.sql_aggregator = self.aggregator.report
147
173
 
148
174
  @classmethod
149
175
  def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":
@@ -154,100 +180,172 @@
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+
+        with self.report.new_stage("Collecting queries from file"):
+            queries = list(self._parse_query_file())
+            logger.info(f"Collected {len(queries)} queries for processing")
+
+        with self.report.new_stage("Processing queries through SQL parsing aggregator"):
+            for query_entry in queries:
+                self._add_query_to_aggregator(query_entry)
+
+        with self.report.new_stage("Generating metadata work units"):
+            logger.info("Generating workunits from SQL parsing aggregator")
+            yield from self.aggregator.gen_metadata()
+
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
                     query_dict = json.loads(line, strict=False)
                     entry = QueryEntry.create(query_dict, config=self.config)
-                    yield from self._process_query(entry)
+                    self.report.num_entries_processed += 1
+                    if self.report.num_entries_processed % 1000 == 0:
+                        logger.info(
+                            f"Processed {self.report.num_entries_processed} query entries"
+                        )
+                    yield entry
                 except Exception as e:
-                    logger.warning("Error processing query", exc_info=True)
-                    self.report.report_warning("process-query", str(e))
-
-        logger.info("Generating workunits")
-        yield from self.builder.gen_workunits()
-
-    def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]:
-        self.report.num_queries_parsed += 1
-        if self.report.num_queries_parsed % 1000 == 0:
-            logger.info(f"Parsed {self.report.num_queries_parsed} queries")
+                    self.report.num_entries_failed += 1
+                    self.report.warning(
+                        title="Error processing query",
+                        message="Query skipped due to parsing error",
+                        context=line.strip(),
+                        exc=e,
+                    )
 
-        result = sqlglot_lineage(
-            sql=entry.query,
-            schema_resolver=self.schema_resolver,
-            default_db=self.config.default_db,
-            default_schema=self.config.default_schema,
-            default_dialect=self.config.default_dialect,
-        )
-        if result.debug_info.table_error:
-            logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
-            self.report.num_table_parse_failures += 1
-            for downstream_urn in set(entry.downstream_tables):
-                self.builder.add_lineage(
-                    downstream_urn=downstream_urn,
-                    upstream_urns=entry.upstream_tables,
-                    timestamp=entry.timestamp,
-                    user=entry.user,
+    def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
+        """Add a query to the SQL parsing aggregator."""
+        try:
+            # If we have both upstream and downstream tables, use explicit lineage
+            if query_entry.upstream_tables and query_entry.downstream_tables:
+                logger.debug("Using explicit lineage from query file")
+                for downstream_table in query_entry.downstream_tables:
+                    known_lineage = KnownQueryLineageInfo(
+                        query_text=query_entry.query,
+                        downstream=str(downstream_table),
+                        upstreams=[str(urn) for urn in query_entry.upstream_tables],
+                        timestamp=query_entry.timestamp,
+                        session_id=query_entry.session_id,
+                    )
+                    self.aggregator.add_known_query_lineage(known_lineage)
+            else:
+                # Warn if only partial lineage information is provided
+                # XOR: true if exactly one of upstream_tables or downstream_tables is provided
+                if bool(query_entry.upstream_tables) ^ bool(
+                    query_entry.downstream_tables
+                ):
+                    query_preview = (
+                        query_entry.query[:150] + "..."
+                        if len(query_entry.query) > 150
+                        else query_entry.query
+                    )
+                    missing_upstream = (
+                        "Missing upstream. " if not query_entry.upstream_tables else ""
+                    )
+                    missing_downstream = (
+                        "Missing downstream. "
+                        if not query_entry.downstream_tables
+                        else ""
+                    )
+                    logger.info(
+                        f"Only partial lineage information provided, falling back to SQL parsing for complete lineage detection. {missing_upstream}{missing_downstream}Query: {query_preview}"
+                    )
+                # No explicit lineage, rely on parsing
+                observed_query = ObservedQuery(
+                    query=query_entry.query,
+                    timestamp=query_entry.timestamp,
+                    user=query_entry.user,
+                    session_id=query_entry.session_id,
+                    default_db=self.config.default_db,
+                    default_schema=self.config.default_schema,
+                    override_dialect=self.config.override_dialect,
                 )
-            return
-        elif result.debug_info.column_error:
-            logger.debug(
-                f"Error parsing column lineage, {result.debug_info.column_error}"
+                self.aggregator.add_observed_query(observed_query)
+
+        except Exception as e:
+            self.report.num_queries_aggregator_failures += 1
+            self.report.warning(
+                title="Error adding query to aggregator",
+                message="Query skipped due to failure when adding query to SQL parsing aggregator",
+                context=query_entry.query,
+                exc=e,
            )
-            self.report.num_column_parse_failures += 1
-
-        yield from self.builder.process_sql_parsing_result(
-            result,
-            query=entry.query,
-            query_timestamp=entry.timestamp,
-            user=entry.user,
-            custom_operation_type=entry.operation_type,
-            include_urns=self.urns,
-        )
 
 
-@dataclass
-class QueryEntry:
+class QueryEntry(BaseModel):
     query: str
-    timestamp: Optional[datetime]
-    user: Optional[str]
-    operation_type: Optional[str]
-    downstream_tables: List[str]
-    upstream_tables: List[str]
+    timestamp: Optional[datetime] = None
+    user: Optional[CorpUserUrn] = None
+    operation_type: Optional[str] = None
+    downstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    upstream_tables: List[DatasetUrn] = Field(default_factory=list)
+    session_id: Optional[str] = None
+
+    # Validation context for URN creation
+    _validation_context: ClassVar[Optional[SqlQueriesSourceConfig]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @validator("timestamp", pre=True)
+    def parse_timestamp(cls, v):
+        return None if v is None else parse_user_datetime(str(v))
+
+    @validator("user", pre=True)
+    def parse_user(cls, v):
+        if v is None:
+            return None
+
+        return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
+
+    @validator("downstream_tables", "upstream_tables", pre=True)
+    def parse_tables(cls, v):
+        if not v:
+            return []
+
+        result = []
+        for item in v:
+            if isinstance(item, DatasetUrn):
+                result.append(item)
+            elif isinstance(item, str):
+                # Skip empty/whitespace-only strings
+                if item and item.strip():
+                    # Convert to URN using validation context
+                    assert cls._validation_context, (
+                        "Validation context must be set for URN creation"
+                    )
+                    urn_string = make_dataset_urn_with_platform_instance(
+                        name=item,
+                        platform=cls._validation_context.platform,
+                        platform_instance=cls._validation_context.platform_instance,
+                        env=cls._validation_context.env,
+                    )
+                    result.append(DatasetUrn.from_string(urn_string))
+
+        return result
 
     @classmethod
     def create(
         cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
     ) -> "QueryEntry":
-        return cls(
-            query=entry_dict["query"],
-            timestamp=(
-                datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc)
-                if "timestamp" in entry_dict
-                else None
-            ),
-            user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None,
-            operation_type=entry_dict.get("operation_type"),
-            downstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("downstream_tables", [])
-            ],
-            upstream_tables=[
-                make_dataset_urn_with_platform_instance(
-                    name=table,
-                    platform=config.platform,
-                    platform_instance=config.platform_instance,
-                    env=config.env,
-                )
-                for table in entry_dict.get("upstream_tables", [])
-            ],
-        )
+        """Create QueryEntry from dict with config context."""
+        # Set validation context for URN creation
+        cls._validation_context = config
+        try:
+            return cls.parse_obj(entry_dict)
+        finally:
+            cls._validation_context = None
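
To make the validation-context handoff above concrete, a minimal usage sketch (invented values):

    # QueryEntry.create pins the config on the class so the pydantic validators
    # can build DatasetUrn objects, then always clears it in the finally block.
    config = SqlQueriesSourceConfig(
        query_file="queries.jsonl",  # invented path
        platform="snowflake",
    )
    entry = QueryEntry.create(
        {"query": "SELECT * FROM raw.orders", "upstream_tables": ["raw.orders"]},
        config=config,
    )
    # entry.upstream_tables now holds DatasetUrn objects such as
    # urn:li:dataset:(urn:li:dataPlatform:snowflake,raw.orders,PROD)

Note that the class-level `_validation_context` is not thread-safe; that is fine for the single-threaded parse in `_parse_query_file`, but worth keeping in mind if `QueryEntry.create` were reused elsewhere.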