acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,6 @@ import enum
4
4
  import functools
5
5
  import json
6
6
  import logging
7
- import os
8
7
  import pathlib
9
8
  import tempfile
10
9
  import uuid
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast
14
13
 
15
14
  import datahub.emitter.mce_builder as builder
16
15
  import datahub.metadata.schema_classes as models
16
+ from datahub.configuration.env_vars import get_sql_agg_query_log
17
17
  from datahub.configuration.time_window_config import get_time_bucket
18
18
  from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
19
19
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
20
- from datahub.emitter.sql_parsing_builder import compute_upstream_fields
21
20
  from datahub.ingestion.api.closeable import Closeable
22
21
  from datahub.ingestion.api.report import Report
23
22
  from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -30,7 +29,9 @@ from datahub.metadata.urns import (
30
29
  DatasetUrn,
31
30
  QueryUrn,
32
31
  SchemaFieldUrn,
32
+ Urn,
33
33
  )
34
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
34
35
  from datahub.sql_parsing.schema_resolver import (
35
36
  SchemaResolver,
36
37
  SchemaResolverInterface,
@@ -47,8 +48,8 @@ from datahub.sql_parsing.sqlglot_lineage import (
47
48
  sqlglot_lineage,
48
49
  )
49
50
  from datahub.sql_parsing.sqlglot_utils import (
51
+ DialectOrStr,
50
52
  _parse_statement,
51
- generate_hash,
52
53
  get_query_fingerprint,
53
54
  try_format_query,
54
55
  )
@@ -57,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
57
58
  ToolMetaExtractorReport,
58
59
  )
59
60
  from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
61
+ from datahub.utilities.dedup_list import deduplicate_list
60
62
  from datahub.utilities.file_backed_collections import (
61
63
  ConnectionWrapper,
62
64
  FileBackedDict,
@@ -81,7 +83,7 @@ class QueryLogSetting(enum.Enum):
81
83
  _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
82
84
  _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
83
85
  _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
84
- os.getenv("DATAHUB_SQL_AGG_QUERY_LOG") or QueryLogSetting.DISABLED.name
86
+ get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
85
87
  ]
86
88
  MAX_UPSTREAM_TABLES_COUNT = 300
87
89
  MAX_FINEGRAINEDLINEAGE_COUNT = 2000
@@ -107,8 +109,9 @@ class ObservedQuery:
107
109
  default_schema: Optional[str] = None
108
110
  query_hash: Optional[str] = None
109
111
  usage_multiplier: int = 1
112
+ override_dialect: Optional[DialectOrStr] = None
110
113
 
111
- # Use this to store addtitional key-value information about query for debugging
114
+ # Use this to store additional key-value information about the query for debugging.
112
115
  extra_info: Optional[dict] = None
113
116
 
114
117
 
@@ -139,6 +142,9 @@ class QueryMetadata:
139
142
 
140
143
  used_temp_tables: bool = True
141
144
 
145
+ extra_info: Optional[dict] = None
146
+ origin: Optional[Urn] = None
147
+
142
148
  def make_created_audit_stamp(self) -> models.AuditStampClass:
143
149
  return models.AuditStampClass(
144
150
  time=make_ts_millis(self.latest_timestamp) or 0,
@@ -152,6 +158,48 @@ class QueryMetadata:
152
158
  actor=(self.actor or _DEFAULT_USER_URN).urn(),
153
159
  )
154
160
 
161
+ def get_subjects(
162
+ self,
163
+ downstream_urn: Optional[str],
164
+ include_fields: bool,
165
+ ) -> List[UrnStr]:
166
+ query_subject_urns = OrderedSet[UrnStr]()
167
+ for upstream in self.upstreams:
168
+ query_subject_urns.add(upstream)
169
+ if include_fields:
170
+ for column in sorted(self.column_usage.get(upstream, [])):
171
+ query_subject_urns.add(
172
+ builder.make_schema_field_urn(upstream, column)
173
+ )
174
+ if downstream_urn:
175
+ query_subject_urns.add(downstream_urn)
176
+ if include_fields:
177
+ for column_lineage in self.column_lineage:
178
+ query_subject_urns.add(
179
+ builder.make_schema_field_urn(
180
+ downstream_urn, column_lineage.downstream.column
181
+ )
182
+ )
183
+ return list(query_subject_urns)
184
+
185
+ def make_query_properties(self) -> models.QueryPropertiesClass:
186
+ return models.QueryPropertiesClass(
187
+ statement=models.QueryStatementClass(
188
+ value=self.formatted_query_string,
189
+ language=models.QueryLanguageClass.SQL,
190
+ ),
191
+ source=models.QuerySourceClass.SYSTEM,
192
+ created=self.make_created_audit_stamp(),
193
+ lastModified=self.make_last_modified_audit_stamp(),
194
+ origin=self.origin.urn() if self.origin else None,
195
+ )
196
+
197
+
198
+ def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
199
+ return models.QuerySubjectsClass(
200
+ subjects=[models.QuerySubjectClass(entity=urn) for urn in urns]
201
+ )
202
+
155
203
 
156
204
  @dataclasses.dataclass
157
205
  class KnownQueryLineageInfo:
@@ -219,8 +267,9 @@ class PreparsedQuery:
219
267
  query_type_props: QueryTypeProps = dataclasses.field(
220
268
  default_factory=lambda: QueryTypeProps()
221
269
  )
222
- # Use this to store addtitional key-value information about query for debugging
270
+ # Use this to store additional key-value information about the query for debugging.
223
271
  extra_info: Optional[dict] = None
272
+ origin: Optional[Urn] = None
224
273
 
225
274
 
226
275
  @dataclasses.dataclass
@@ -584,6 +633,9 @@ class SqlParsingAggregator(Closeable):
584
633
  TableSwap,
585
634
  ],
586
635
  ) -> None:
636
+ """
637
+ This assumes that queries come in order of increasing timestamps.
638
+ """
587
639
  if isinstance(item, KnownQueryLineageInfo):
588
640
  self.add_known_query_lineage(item)
589
641
  elif isinstance(item, KnownLineageMapping):
@@ -786,6 +838,7 @@ class SqlParsingAggregator(Closeable):
786
838
  session_id=session_id,
787
839
  timestamp=observed.timestamp,
788
840
  user=observed.user,
841
+ override_dialect=observed.override_dialect,
789
842
  )
790
843
  if parsed.debug_info.error:
791
844
  self.report.observed_query_parse_failures.append(
@@ -814,7 +867,7 @@ class SqlParsingAggregator(Closeable):
814
867
  downstream=parsed.out_tables[0] if parsed.out_tables else None,
815
868
  column_lineage=parsed.column_lineage,
816
869
  # TODO: We need a full list of columns referenced, not just the out tables.
817
- column_usage=compute_upstream_fields(parsed),
870
+ column_usage=self._compute_upstream_fields(parsed),
818
871
  inferred_schema=infer_output_schema(parsed),
819
872
  confidence_score=parsed.debug_info.confidence,
820
873
  extra_info=observed.extra_info,
@@ -903,6 +956,8 @@ class SqlParsingAggregator(Closeable):
903
956
  column_usage=parsed.column_usage or {},
904
957
  confidence_score=parsed.confidence_score,
905
958
  used_temp_tables=session_has_temp_tables,
959
+ extra_info=parsed.extra_info,
960
+ origin=parsed.origin,
906
961
  )
907
962
  )
908
963
 
@@ -1101,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
1101
1156
  actor=None,
1102
1157
  upstreams=parsed.in_tables,
1103
1158
  column_lineage=parsed.column_lineage or [],
1104
- column_usage=compute_upstream_fields(parsed),
1159
+ column_usage=self._compute_upstream_fields(parsed),
1105
1160
  confidence_score=parsed.debug_info.confidence,
1106
1161
  )
1107
1162
  )
@@ -1118,6 +1173,7 @@ class SqlParsingAggregator(Closeable):
1118
1173
  session_id: str = _MISSING_SESSION_ID,
1119
1174
  timestamp: Optional[datetime] = None,
1120
1175
  user: Optional[Union[CorpUserUrn, CorpGroupUrn]] = None,
1176
+ override_dialect: Optional[DialectOrStr] = None,
1121
1177
  ) -> SqlParsingResult:
1122
1178
  with self.report.sql_parsing_timer:
1123
1179
  parsed = sqlglot_lineage(
@@ -1125,6 +1181,7 @@ class SqlParsingAggregator(Closeable):
1125
1181
  schema_resolver=schema_resolver,
1126
1182
  default_db=default_db,
1127
1183
  default_schema=default_schema,
1184
+ override_dialect=override_dialect,
1128
1185
  )
1129
1186
  self.report.num_sql_parsed += 1
1130
1187
 
@@ -1283,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
1283
1340
  upstreams.setdefault(upstream, query.query_id)
1284
1341
 
1285
1342
  for lineage_info in query.column_lineage:
1286
- for upstream_ref in lineage_info.upstreams:
1287
- cll[lineage_info.downstream.column].setdefault(
1288
- SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1289
- query.query_id,
1343
+ if (
1344
+ not lineage_info.downstream.column
1345
+ or not lineage_info.downstream.column.strip()
1346
+ ):
1347
+ logger.debug(
1348
+ f"Skipping lineage entry with empty downstream column in query {query.query_id}"
1290
1349
  )
1350
+ continue
1351
+
1352
+ for upstream_ref in lineage_info.upstreams:
1353
+ if upstream_ref.column and upstream_ref.column.strip():
1354
+ cll[lineage_info.downstream.column].setdefault(
1355
+ SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1356
+ query.query_id,
1357
+ )
1358
+ else:
1359
+ logger.debug(
1360
+ f"Skipping empty column reference in lineage for query {query.query_id}"
1361
+ )
1291
1362
 
1292
1363
  # Finally, we can build our lineage edge.
1293
1364
  required_queries = OrderedSet[QueryId]()
@@ -1320,6 +1391,13 @@ class SqlParsingAggregator(Closeable):
1320
1391
  ):
1321
1392
  upstream_columns = [x[0] for x in upstream_columns_for_query]
1322
1393
  required_queries.add(query_id)
1394
+ query = queries_map[query_id]
1395
+
1396
+ column_logic = None
1397
+ for lineage_info in query.column_lineage:
1398
+ if lineage_info.downstream.column == downstream_column:
1399
+ column_logic = lineage_info.logic
1400
+ break
1323
1401
 
1324
1402
  upstream_aspect.fineGrainedLineages.append(
1325
1403
  models.FineGrainedLineageClass(
@@ -1337,7 +1415,16 @@ class SqlParsingAggregator(Closeable):
1337
1415
  if self.can_generate_query(query_id)
1338
1416
  else None
1339
1417
  ),
1340
- confidenceScore=queries_map[query_id].confidence_score,
1418
+ confidenceScore=query.confidence_score,
1419
+ transformOperation=(
1420
+ (
1421
+ f"COPY: {column_logic.column_logic}"
1422
+ if column_logic.is_direct_copy
1423
+ else f"SQL: {column_logic.column_logic}"
1424
+ )
1425
+ if column_logic
1426
+ else None
1427
+ ),
1341
1428
  )
1342
1429
  )
1343
1430
 
@@ -1429,47 +1516,21 @@ class SqlParsingAggregator(Closeable):
1429
1516
  return
1430
1517
 
1431
1518
  # If a query doesn't involve any allowed tables, skip it.
1432
- if downstream_urn is None and not any(
1433
- self.is_allowed_table(urn) for urn in query.upstreams
1434
- ):
1519
+ if (
1520
+ downstream_urn is None or not self.is_allowed_table(downstream_urn)
1521
+ ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
1435
1522
  self.report.num_queries_skipped_due_to_filters += 1
1436
1523
  return
1437
1524
 
1438
- query_subject_urns = OrderedSet[UrnStr]()
1439
- for upstream in query.upstreams:
1440
- query_subject_urns.add(upstream)
1441
- if self.generate_query_subject_fields:
1442
- for column in sorted(query.column_usage.get(upstream, [])):
1443
- query_subject_urns.add(
1444
- builder.make_schema_field_urn(upstream, column)
1445
- )
1446
- if downstream_urn:
1447
- query_subject_urns.add(downstream_urn)
1448
- if self.generate_query_subject_fields:
1449
- for column_lineage in query.column_lineage:
1450
- query_subject_urns.add(
1451
- builder.make_schema_field_urn(
1452
- downstream_urn, column_lineage.downstream.column
1453
- )
1454
- )
1455
-
1456
1525
  yield from MetadataChangeProposalWrapper.construct_many(
1457
1526
  entityUrn=self._query_urn(query_id),
1458
1527
  aspects=[
1459
- models.QueryPropertiesClass(
1460
- statement=models.QueryStatementClass(
1461
- value=query.formatted_query_string,
1462
- language=models.QueryLanguageClass.SQL,
1463
- ),
1464
- source=models.QuerySourceClass.SYSTEM,
1465
- created=query.make_created_audit_stamp(),
1466
- lastModified=query.make_last_modified_audit_stamp(),
1467
- ),
1468
- models.QuerySubjectsClass(
1469
- subjects=[
1470
- models.QuerySubjectClass(entity=urn)
1471
- for urn in query_subject_urns
1472
- ]
1528
+ query.make_query_properties(),
1529
+ make_query_subjects(
1530
+ query.get_subjects(
1531
+ downstream_urn=downstream_urn,
1532
+ include_fields=self.generate_query_subject_fields,
1533
+ )
1473
1534
  ),
1474
1535
  models.DataPlatformInstanceClass(
1475
1536
  platform=self.platform.urn(),
@@ -1538,27 +1599,33 @@ class SqlParsingAggregator(Closeable):
1538
1599
 
1539
1600
  @dataclasses.dataclass
1540
1601
  class QueryLineageInfo:
1541
- upstreams: List[UrnStr] # this is direct upstreams, with *no temp tables*
1542
- column_lineage: List[ColumnLineageInfo]
1602
+ upstreams: OrderedSet[
1603
+ UrnStr
1604
+ ] # this is direct upstreams, with *no temp tables*
1605
+ column_lineage: OrderedSet[ColumnLineageInfo]
1543
1606
  confidence_score: float
1544
1607
 
1545
1608
  def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
1546
- self.upstreams += other_query.upstreams
1547
- self.column_lineage += other_query.column_lineage
1609
+ self.upstreams.update(other_query.upstreams)
1610
+ self.column_lineage.update(other_query.column_lineage)
1548
1611
  self.confidence_score = min(
1549
1612
  self.confidence_score, other_query.confidence_score
1550
1613
  )
1551
1614
 
1615
+ cache: Dict[str, QueryLineageInfo] = {}
1616
+
1552
1617
  def _recurse_into_query(
1553
1618
  query: QueryMetadata, recursion_path: List[QueryId]
1554
1619
  ) -> QueryLineageInfo:
1555
1620
  if query.query_id in recursion_path:
1556
1621
  # This is a cycle, so we just return the query as-is.
1557
1622
  return QueryLineageInfo(
1558
- upstreams=query.upstreams,
1559
- column_lineage=query.column_lineage,
1623
+ upstreams=OrderedSet(query.upstreams),
1624
+ column_lineage=OrderedSet(query.column_lineage),
1560
1625
  confidence_score=query.confidence_score,
1561
1626
  )
1627
+ if query.query_id in cache:
1628
+ return cache[query.query_id]
1562
1629
  recursion_path = [*recursion_path, query.query_id]
1563
1630
  composed_of_queries.add(query.query_id)
1564
1631
 
@@ -1573,7 +1640,7 @@ class SqlParsingAggregator(Closeable):
1573
1640
  upstream_query = self._query_map.get(upstream_query_id)
1574
1641
  if (
1575
1642
  upstream_query
1576
- and upstream_query.query_id not in composed_of_queries
1643
+ and upstream_query.query_id not in recursion_path
1577
1644
  ):
1578
1645
  temp_query_lineage_info = _recurse_into_query(
1579
1646
  upstream_query, recursion_path
@@ -1633,11 +1700,14 @@ class SqlParsingAggregator(Closeable):
1633
1700
  ]
1634
1701
  )
1635
1702
 
1636
- return QueryLineageInfo(
1637
- upstreams=list(new_upstreams),
1638
- column_lineage=new_cll,
1703
+ ret = QueryLineageInfo(
1704
+ upstreams=new_upstreams,
1705
+ column_lineage=OrderedSet(new_cll),
1639
1706
  confidence_score=new_confidence_score,
1640
1707
  )
1708
+ cache[query.query_id] = ret
1709
+
1710
+ return ret
1641
1711
 
1642
1712
  resolved_lineage_info = _recurse_into_query(base_query, [])
1643
1713
 
@@ -1670,20 +1740,30 @@ class SqlParsingAggregator(Closeable):
1670
1740
  )
1671
1741
 
1672
1742
  merged_query_text = ";\n\n".join(
1673
- [q.formatted_query_string for q in ordered_queries]
1743
+ deduplicate_list([q.formatted_query_string for q in ordered_queries])
1674
1744
  )
1675
1745
 
1676
1746
  resolved_query = dataclasses.replace(
1677
1747
  base_query,
1678
1748
  query_id=composite_query_id,
1679
1749
  formatted_query_string=merged_query_text,
1680
- upstreams=resolved_lineage_info.upstreams,
1681
- column_lineage=resolved_lineage_info.column_lineage,
1750
+ upstreams=list(resolved_lineage_info.upstreams),
1751
+ column_lineage=list(resolved_lineage_info.column_lineage),
1682
1752
  confidence_score=resolved_lineage_info.confidence_score,
1683
1753
  )
1684
1754
 
1685
1755
  return resolved_query
1686
1756
 
1757
+ @staticmethod
1758
+ def _compute_upstream_fields(
1759
+ result: SqlParsingResult,
1760
+ ) -> Dict[UrnStr, Set[UrnStr]]:
1761
+ upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
1762
+ for cl in result.column_lineage or []:
1763
+ for upstream in cl.upstreams:
1764
+ upstream_fields[upstream.table].add(upstream.column)
1765
+ return upstream_fields
1766
+
1687
1767
  def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
1688
1768
  if not self._usage_aggregator:
1689
1769
  return
@@ -1733,8 +1813,9 @@ class SqlParsingAggregator(Closeable):
1733
1813
  operationType=operation_type,
1734
1814
  lastUpdatedTimestamp=make_ts_millis(query.latest_timestamp),
1735
1815
  actor=query.actor.urn() if query.actor else None,
1736
- customProperties=(
1737
- {"query_urn": self._query_urn(query_id)}
1816
+ sourceType=models.OperationSourceTypeClass.DATA_PLATFORM,
1817
+ queries=(
1818
+ [self._query_urn(query_id)]
1738
1819
  if self.can_generate_query(query_id)
1739
1820
  else None
1740
1821
  ),