acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; review the list of changed files below for details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,943 @@
1
+ from __future__ import annotations
2
+
3
+ import difflib
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Any,
9
+ Callable,
10
+ Dict,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Set,
15
+ Union,
16
+ overload,
17
+ )
18
+
19
+ from typing_extensions import assert_never, deprecated
20
+
21
+ import datahub.metadata.schema_classes as models
22
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
23
+ from datahub.errors import SdkUsageError
24
+ from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn, SchemaFieldUrn, Urn
25
+ from datahub.sdk._shared import (
26
+ ChartUrnOrStr,
27
+ DashboardUrnOrStr,
28
+ DatajobUrnOrStr,
29
+ DatasetUrnOrStr,
30
+ )
31
+ from datahub.sdk._utils import DEFAULT_ACTOR_URN
32
+ from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
33
+ from datahub.sdk.search_client import compile_filters
34
+ from datahub.sdk.search_filters import Filter, FilterDsl
35
+ from datahub.specific.chart import ChartPatchBuilder
36
+ from datahub.specific.dashboard import DashboardPatchBuilder
37
+ from datahub.specific.datajob import DataJobPatchBuilder
38
+ from datahub.specific.dataset import DatasetPatchBuilder
39
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
40
+ from datahub.utilities.ordered_set import OrderedSet
41
+ from datahub.utilities.urns.error import InvalidUrnError
42
+
43
+ if TYPE_CHECKING:
44
+ from datahub.sdk.main_client import DataHubClient
45
+
46
+
47
# Audit stamp used when there is no meaningful timestamp/actor to attach:
# epoch time 0 and the SDK's default actor URN.
_empty_audit_stamp = models.AuditStampClass(
    time=0,
    actor=DEFAULT_ACTOR_URN,
)


logger = logging.getLogger(__name__)
54
+
55
+
56
@dataclass
class LineagePath:
    """One node on a column-level lineage path returned by the lineage query."""

    urn: str  # URN of the path node (only SCHEMA_FIELD nodes are recorded)
    entity_name: str  # name of the dataset that owns the field
    column_name: Optional[str] = None  # field path within that dataset
61
+
62
+
63
@dataclass
class LineageResult:
    """A single entity reached by a lineage traversal."""

    urn: str  # URN of the related entity
    type: str  # entity type string as reported by the GraphQL API
    hops: int  # hop distance ("degree") from the source entity
    direction: Literal["upstream", "downstream"]  # traversal direction used
    platform: Optional[str] = None  # platform name, when the entity exposes one
    name: Optional[str] = None  # display name from entity properties
    description: Optional[str] = None  # description from entity properties
    paths: Optional[List[LineagePath]] = None  # column-level paths (schema fields only)
73
+
74
+
75
+ class LineageClient:
76
    def __init__(self, client: DataHubClient):
        """Create a lineage client bound to an existing DataHubClient.

        Keeps a reference to the client (for entity patch updates) and to its
        underlying graph connection (for GraphQL and aspect access).
        """
        self._client = client
        self._graph = client._graph
79
+
80
+ def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
81
+ schema_metadata = self._client._graph.get_aspect(
82
+ str(dataset_urn), models.SchemaMetadataClass
83
+ )
84
+ if schema_metadata is None:
85
+ return set()
86
+
87
+ return {field.fieldPath for field in schema_metadata.fields}
88
+
89
+ @classmethod
90
+ def _get_strict_column_lineage(
91
+ cls,
92
+ upstream_fields: Set[str],
93
+ downstream_fields: Set[str],
94
+ ) -> ColumnLineageMapping:
95
+ """Find matches between upstream and downstream fields with case-insensitive matching."""
96
+ strict_column_lineage: ColumnLineageMapping = {}
97
+
98
+ # Create case-insensitive mapping of upstream fields
99
+ case_insensitive_map = {field.lower(): field for field in upstream_fields}
100
+
101
+ # Match downstream fields using case-insensitive comparison
102
+ for downstream_field in downstream_fields:
103
+ lower_field = downstream_field.lower()
104
+ if lower_field in case_insensitive_map:
105
+ # Use the original case of the upstream field
106
+ strict_column_lineage[downstream_field] = [
107
+ case_insensitive_map[lower_field]
108
+ ]
109
+
110
+ return strict_column_lineage
111
+
112
+ @classmethod
113
+ def _get_fuzzy_column_lineage(
114
+ cls,
115
+ upstream_fields: Set[str],
116
+ downstream_fields: Set[str],
117
+ ) -> ColumnLineageMapping:
118
+ """Generate fuzzy matches between upstream and downstream fields."""
119
+
120
+ # Simple normalization function for better matching
121
+ def normalize(s: str) -> str:
122
+ return s.lower().replace("_", "")
123
+
124
+ # Create normalized lookup for upstream fields
125
+ normalized_upstream = {normalize(field): field for field in upstream_fields}
126
+
127
+ fuzzy_column_lineage = {}
128
+ for downstream_field in downstream_fields:
129
+ # Try exact match first
130
+ if downstream_field in upstream_fields:
131
+ fuzzy_column_lineage[downstream_field] = [downstream_field]
132
+ continue
133
+
134
+ # Try normalized match
135
+ norm_downstream = normalize(downstream_field)
136
+ if norm_downstream in normalized_upstream:
137
+ fuzzy_column_lineage[downstream_field] = [
138
+ normalized_upstream[norm_downstream]
139
+ ]
140
+ continue
141
+
142
+ # If no direct match, find closest match using similarity
143
+ matches = difflib.get_close_matches(
144
+ norm_downstream,
145
+ normalized_upstream.keys(),
146
+ n=1, # Return only the best match
147
+ cutoff=0.8, # Adjust cutoff for sensitivity
148
+ )
149
+
150
+ if matches:
151
+ fuzzy_column_lineage[downstream_field] = [
152
+ normalized_upstream[matches[0]]
153
+ ]
154
+
155
+ return fuzzy_column_lineage
156
+
157
+ @overload
158
+ def add_lineage(
159
+ self,
160
+ *,
161
+ upstream: DatasetUrnOrStr,
162
+ downstream: DatasetUrnOrStr,
163
+ column_lineage: Union[
164
+ bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
165
+ ] = False,
166
+ transformation_text: Optional[str] = None,
167
+ ) -> None:
168
+ """Add dataset-to-dataset lineage with column-level mapping."""
169
+
170
+ @overload
171
+ def add_lineage(
172
+ self,
173
+ *,
174
+ upstream: Union[DatajobUrnOrStr],
175
+ downstream: DatasetUrnOrStr,
176
+ ) -> None:
177
+ """Add dataset-to-datajob or dataset-to-mlmodel lineage."""
178
+
179
+ @overload
180
+ def add_lineage(
181
+ self,
182
+ *,
183
+ upstream: Union[DatasetUrnOrStr, DatajobUrnOrStr],
184
+ downstream: DatajobUrnOrStr,
185
+ ) -> None:
186
+ """Add datajob-to-dataset or datajob-to-datajob lineage."""
187
+
188
+ @overload
189
+ def add_lineage(
190
+ self,
191
+ *,
192
+ upstream: Union[DashboardUrnOrStr, DatasetUrnOrStr, ChartUrnOrStr],
193
+ downstream: DashboardUrnOrStr,
194
+ ) -> None:
195
+ """Add dashboard-to-dashboard or dashboard-to-dataset lineage."""
196
+
197
+ @overload
198
+ def add_lineage(
199
+ self,
200
+ *,
201
+ upstream: DatasetUrnOrStr,
202
+ downstream: ChartUrnOrStr,
203
+ ) -> None:
204
+ """Add dataset-to-chart lineage."""
205
+
206
+ # The actual implementation that handles all overloaded cases
207
+ def add_lineage(
208
+ self,
209
+ *,
210
+ upstream: Union[
211
+ DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
212
+ ],
213
+ downstream: Union[
214
+ DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
215
+ ],
216
+ column_lineage: Union[
217
+ bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
218
+ ] = False,
219
+ transformation_text: Optional[str] = None,
220
+ ) -> None:
221
+ """Add lineage between two entities.
222
+
223
+ This flexible method handles different combinations of entity types:
224
+ - dataset to dataset
225
+ - dataset to datajob
226
+ - datajob to dataset
227
+ - datajob to datajob
228
+ - dashboard to dataset
229
+ - dashboard to chart
230
+ - dashboard to dashboard
231
+ - dataset to chart
232
+
233
+ Args:
234
+ upstream: URN of the upstream entity (dataset or datajob)
235
+ downstream: URN of the downstream entity (dataset or datajob)
236
+ column_lineage: Optional boolean to indicate if column-level lineage should be added or a lineage mapping type (auto_fuzzy, auto_strict, or a mapping of column-level lineage)
237
+ transformation_text: Optional SQL query text that defines the transformation
238
+ (only applicable for dataset-to-dataset lineage)
239
+
240
+ Raises:
241
+ InvalidUrnError: If the URNs provided are invalid
242
+ SdkUsageError: If certain parameter combinations are not supported
243
+ """
244
+ # Validate parameter combinations
245
+ upstream_entity_type = Urn.from_string(upstream).entity_type
246
+ downstream_entity_type = Urn.from_string(downstream).entity_type
247
+
248
+ key = (upstream_entity_type, downstream_entity_type)
249
+
250
+ # if it's not dataset-dataset lineage but provided with column_lineage or transformation_text, raise an error
251
+ if key != ("dataset", "dataset") and (column_lineage or transformation_text):
252
+ raise SdkUsageError(
253
+ "Column lineage and query text are only applicable for dataset-to-dataset lineage"
254
+ )
255
+
256
+ lineage_handlers: dict[tuple[str, str], Callable] = {
257
+ ("dataset", "dataset"): self._add_dataset_lineage,
258
+ ("dataset", "dashboard"): self._add_dashboard_lineage,
259
+ ("chart", "dashboard"): self._add_dashboard_lineage,
260
+ ("dashboard", "dashboard"): self._add_dashboard_lineage,
261
+ ("dataset", "dataJob"): self._add_datajob_lineage,
262
+ ("dataJob", "dataJob"): self._add_datajob_lineage,
263
+ ("dataJob", "dataset"): self._add_datajob_output,
264
+ ("dataset", "chart"): self._add_chart_lineage,
265
+ }
266
+
267
+ try:
268
+ lineage_handler = lineage_handlers[key]
269
+ lineage_handler(
270
+ upstream=upstream,
271
+ downstream=downstream,
272
+ upstream_type=upstream_entity_type,
273
+ column_lineage=column_lineage,
274
+ transformation_text=transformation_text,
275
+ )
276
+ except KeyError:
277
+ raise SdkUsageError(
278
+ f"Unsupported entity type combination: {upstream_entity_type} -> {downstream_entity_type}"
279
+ ) from None
280
+
281
+ def _add_dataset_lineage(
282
+ self,
283
+ *,
284
+ upstream,
285
+ downstream,
286
+ column_lineage,
287
+ transformation_text,
288
+ **_,
289
+ ):
290
+ upstream_urn = DatasetUrn.from_string(upstream)
291
+ downstream_urn = DatasetUrn.from_string(downstream)
292
+
293
+ if column_lineage:
294
+ column_lineage = (
295
+ "auto_fuzzy" if column_lineage is True else column_lineage
296
+ ) # if column_lineage is True, set it to auto_fuzzy
297
+ cll = self._process_column_lineage(
298
+ column_lineage, upstream_urn, downstream_urn
299
+ )
300
+ else:
301
+ cll = None
302
+
303
+ if transformation_text:
304
+ self._process_transformation_lineage(
305
+ transformation_text, upstream_urn, downstream_urn, cll
306
+ )
307
+ else:
308
+ updater = DatasetPatchBuilder(str(downstream_urn))
309
+ updater.add_upstream_lineage(
310
+ models.UpstreamClass(
311
+ dataset=str(upstream_urn),
312
+ type=models.DatasetLineageTypeClass.COPY,
313
+ )
314
+ )
315
+ for cl in cll or []:
316
+ updater.add_fine_grained_upstream_lineage(cl)
317
+ self._client.entities.update(updater)
318
+
319
+ def _add_dashboard_lineage(self, *, upstream, downstream, upstream_type, **_):
320
+ patch = DashboardPatchBuilder(str(downstream))
321
+ if upstream_type == "dataset":
322
+ patch.add_dataset_edge(upstream)
323
+ elif upstream_type == "chart":
324
+ patch.add_chart_edge(upstream)
325
+ elif upstream_type == "dashboard":
326
+ patch.add_dashboard(upstream)
327
+ else:
328
+ raise SdkUsageError(
329
+ f"Unsupported entity type combination: {upstream_type} -> dashboard"
330
+ )
331
+ self._client.entities.update(patch)
332
+
333
+ def _add_datajob_lineage(self, *, upstream, downstream, upstream_type, **_):
334
+ patch = DataJobPatchBuilder(str(downstream))
335
+ if upstream_type == "dataset":
336
+ patch.add_input_dataset(upstream)
337
+ elif upstream_type == "dataJob":
338
+ patch.add_input_datajob(upstream)
339
+ else:
340
+ raise SdkUsageError(
341
+ f"Unsupported entity type combination: {upstream_type} -> dataJob"
342
+ )
343
+ self._client.entities.update(patch)
344
+
345
+ def _add_datajob_output(self, *, upstream, downstream, **_):
346
+ patch = DataJobPatchBuilder(str(upstream))
347
+ patch.add_output_dataset(downstream)
348
+ self._client.entities.update(patch)
349
+
350
+ def _add_chart_lineage(self, *, upstream, downstream, **_):
351
+ patch = ChartPatchBuilder(str(downstream))
352
+ patch.add_input_edge(upstream)
353
+ self._client.entities.update(patch)
354
+
355
    def _process_column_lineage(self, column_lineage, upstream_urn, downstream_urn):
        """Resolve the requested column-lineage mode into fine-grained lineage.

        `column_lineage` is either an explicit downstream->upstreams mapping or
        one of the "auto_fuzzy"/"auto_strict" markers; in the auto case the
        mapping is derived from both datasets' schemas.
        """
        cll = None
        if column_lineage:
            # Auto column lineage generation
            if column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
                upstream_schema = self._get_fields_from_dataset_urn(upstream_urn)
                downstream_schema = self._get_fields_from_dataset_urn(downstream_urn)

                # Choose matching strategy
                mapping = (
                    self._get_fuzzy_column_lineage(upstream_schema, downstream_schema)
                    if column_lineage == "auto_fuzzy"
                    else self._get_strict_column_lineage(
                        upstream_schema, downstream_schema
                    )
                )
                cll = parse_cll_mapping(
                    upstream=upstream_urn,
                    downstream=downstream_urn,
                    cll_mapping=mapping,
                )
            # Explicit column lineage
            elif isinstance(column_lineage, dict):
                cll = parse_cll_mapping(
                    upstream=upstream_urn,
                    downstream=downstream_urn,
                    cll_mapping=column_lineage,
                )
            else:
                # Any other truthy value is a caller error; fail loudly.
                assert_never(column_lineage)
        return cll
386
+
387
    def _process_transformation_lineage(
        self, transformation_text, upstream_urn, downstream_urn, cll
    ):
        """Record dataset-to-dataset lineage backed by a transformation query.

        Creates a Query entity keyed by a hash of `transformation_text`, links
        it to the upstream/downstream datasets (plus any columns in `cll`),
        and emits everything to the graph in a single batch.
        """
        # Subjects of the query: both datasets and every column referenced by
        # the fine-grained lineage.
        fields_involved = OrderedSet([str(upstream_urn), str(downstream_urn)])
        if cll is not None:
            for c in cll:
                for field in c.upstreams or []:
                    fields_involved.add(field)
                for field in c.downstreams or []:
                    fields_involved.add(field)

        # Create query URN and entity
        query_urn = QueryUrn(generate_hash(transformation_text)).urn()
        from datahub.sql_parsing.sql_parsing_aggregator import (
            make_query_subjects,
        )

        query_entity = MetadataChangeProposalWrapper.construct_many(
            query_urn,
            aspects=[
                models.QueryPropertiesClass(
                    statement=models.QueryStatementClass(
                        value=transformation_text,
                        language=models.QueryLanguageClass.SQL,
                    ),
                    source=models.QuerySourceClass.SYSTEM,
                    created=_empty_audit_stamp,
                    lastModified=_empty_audit_stamp,
                ),
                make_query_subjects(list(fields_involved)),
            ],
        )

        # Build dataset update
        updater = DatasetPatchBuilder(str(downstream_urn))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream_urn),
                type=models.DatasetLineageTypeClass.TRANSFORMED,
                query=query_urn,
            )
        )

        # Add fine-grained lineage, attributing each edge to the query.
        for cl in cll or []:
            cl.query = query_urn
            updater.add_fine_grained_upstream_lineage(cl)

        # Check dataset existence before emitting anything.
        if not self._client._graph.exists(updater.urn):
            raise SdkUsageError(
                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
            )

        # Emit metadata change proposals
        mcps: List[
            Union[
                MetadataChangeProposalWrapper,
                models.MetadataChangeProposalClass,
            ]
        ] = list(updater.build())
        if query_entity:
            mcps.extend(query_entity)
        self._client._graph.emit_mcps(mcps)
451
+
452
    def infer_lineage_from_sql(
        self,
        *,
        query_text: str,
        platform: str,
        platform_instance: Optional[str] = None,
        env: str = "PROD",
        default_db: Optional[str] = None,
        default_schema: Optional[str] = None,
        override_dialect: Optional[str] = None,
    ) -> None:
        """Add lineage by parsing a SQL query.

        Parses `query_text` and records lineage from every upstream table
        found to the first output table, including column-level lineage where
        the parser can derive it.

        Args:
            query_text: The SQL statement to parse.
            platform: Platform name used to resolve table URNs.
            platform_instance: Optional platform instance for URN resolution.
            env: Environment for URN resolution (defaults to "PROD").
            default_db: Database assumed for unqualified table names.
            default_schema: Schema assumed for unqualified table names.
            override_dialect: Optional SQL dialect override for the parser.

        Raises:
            SdkUsageError: If the query cannot be parsed at the table level or
                yields no output tables.
        """
        from datahub.sql_parsing.sqlglot_lineage import (
            create_lineage_sql_parsed_result,
        )

        # Parse the SQL query to extract lineage information
        parsed_result = create_lineage_sql_parsed_result(
            query=query_text,
            default_db=default_db,
            default_schema=default_schema,
            platform=platform,
            platform_instance=platform_instance,
            env=env,
            graph=self._client._graph,
            override_dialect=override_dialect,
        )

        if parsed_result.debug_info.table_error:
            raise SdkUsageError(
                f"Failed to parse SQL query: {parsed_result.debug_info.error}"
            )
        elif parsed_result.debug_info.column_error:
            # Table-level lineage is still usable; only warn about columns.
            logger.warning(
                f"Failed to parse SQL query: {parsed_result.debug_info.error}",
            )

        if not parsed_result.out_tables:
            raise SdkUsageError(
                "No output tables found in the query. Cannot establish lineage."
            )

        # Use the first output table as the downstream
        downstream_urn = parsed_result.out_tables[0]

        # Process all upstream tables found in the query
        for upstream_table in parsed_result.in_tables:
            # Skip self-lineage
            if upstream_table == downstream_urn:
                continue

            # Extract column-level lineage for this specific upstream table
            column_mapping = {}
            if parsed_result.column_lineage:
                for col_lineage in parsed_result.column_lineage:
                    if not (col_lineage.downstream and col_lineage.downstream.column):
                        continue

                    # Filter upstreams to only include columns from current upstream table
                    upstream_cols = [
                        ref.column
                        for ref in col_lineage.upstreams
                        if ref.table == upstream_table and ref.column
                    ]

                    if upstream_cols:
                        column_mapping[col_lineage.downstream.column] = upstream_cols

            # Add lineage, including query text
            self.add_lineage(
                upstream=upstream_table,
                downstream=downstream_urn,
                column_lineage=column_mapping,
                transformation_text=query_text,
            )
527
+
528
    @deprecated("Use add_lineage instead")
    def add_dataset_copy_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Union[
            None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
        ] = "auto_fuzzy",
    ) -> None:
        """Add COPY-type lineage between two datasets (deprecated).

        `column_lineage` may be an explicit mapping, one of the auto modes
        ("auto_fuzzy"/"auto_strict") derived from the two schemas, or None to
        skip column-level lineage entirely.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        if column_lineage is None:
            cll = None
        elif column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
            # Derive the column mapping from both schemas.
            upstream_schema = self._get_fields_from_dataset_urn(upstream)
            downstream_schema = self._get_fields_from_dataset_urn(downstream)
            if column_lineage == "auto_fuzzy":
                mapping = self._get_fuzzy_column_lineage(
                    upstream_schema, downstream_schema
                )
            else:
                mapping = self._get_strict_column_lineage(
                    upstream_schema, downstream_schema
                )
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=mapping,
            )
        elif isinstance(column_lineage, dict):
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )
        else:
            assert_never(column_lineage)

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.COPY,
            )
        )
        for cl in cll or []:
            updater.add_fine_grained_upstream_lineage(cl)

        self._client.entities.update(updater)
579
+
580
    @deprecated("Use add_lineage instead")
    def add_dataset_transform_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Optional[ColumnLineageMapping] = None,
        transformation_text: Optional[str] = None,
    ) -> None:
        """Add TRANSFORMED-type lineage between two datasets (deprecated).

        Optionally attaches column-level lineage and, when
        `transformation_text` is given, a Query entity recording the
        transformation; all changes are emitted in one batch.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        cll = None
        if column_lineage is not None:
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )

        # Query subjects: both datasets plus every column in the CLL.
        fields_involved = OrderedSet([str(upstream), str(downstream)])
        if cll is not None:
            for c in cll:
                for field in c.upstreams or []:
                    fields_involved.add(field)
                for field in c.downstreams or []:
                    fields_involved.add(field)

        query_urn = None
        query_entity = None
        if transformation_text:
            # Eventually we might want to use our regex-based fingerprinting instead.
            fingerprint = generate_hash(transformation_text)
            query_urn = QueryUrn(fingerprint).urn()

            from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects

            query_entity = MetadataChangeProposalWrapper.construct_many(
                query_urn,
                aspects=[
                    models.QueryPropertiesClass(
                        statement=models.QueryStatementClass(
                            value=transformation_text,
                            language=models.QueryLanguageClass.SQL,
                        ),
                        source=models.QuerySourceClass.SYSTEM,
                        created=_empty_audit_stamp,
                        lastModified=_empty_audit_stamp,
                    ),
                    make_query_subjects(list(fields_involved)),
                ],
            )

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.TRANSFORMED,
                query=query_urn,
            )
        )
        for cl in cll or []:
            cl.query = query_urn
            updater.add_fine_grained_upstream_lineage(cl)

        # Throw if the dataset does not exist.
        # We need to manually call .build() instead of reusing client.update()
        # so that we make just one emit_mcps call.
        if not self._client._graph.exists(updater.urn):
            raise SdkUsageError(
                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
            )

        mcps: List[
            Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
        ] = list(updater.build())
        if query_entity:
            mcps.extend(query_entity)
        self._client._graph.emit_mcps(mcps)
659
+
660
+ @deprecated("Use add_lineage instead")
661
+ def add_datajob_lineage(
662
+ self,
663
+ *,
664
+ datajob: DatajobUrnOrStr,
665
+ upstreams: Optional[List[Union[DatasetUrnOrStr, DatajobUrnOrStr]]] = None,
666
+ downstreams: Optional[List[DatasetUrnOrStr]] = None,
667
+ ) -> None:
668
+ """
669
+ Add lineage between a datajob and datasets/datajobs.
670
+
671
+ Args:
672
+ datajob: The datajob URN to connect lineage with
673
+ upstreams: List of upstream datasets or datajobs that serve as inputs to the datajob
674
+ downstreams: List of downstream datasets that are outputs of the datajob
675
+ """
676
+
677
+ if not upstreams and not downstreams:
678
+ raise SdkUsageError("No upstreams or downstreams provided")
679
+
680
+ datajob_urn = DataJobUrn.from_string(datajob)
681
+
682
+ # Initialize the patch builder for the datajob
683
+ patch_builder = DataJobPatchBuilder(str(datajob_urn))
684
+
685
+ # Process upstream connections (inputs to the datajob)
686
+ if upstreams:
687
+ for upstream in upstreams:
688
+ # try converting to dataset urn
689
+ try:
690
+ dataset_urn = DatasetUrn.from_string(upstream)
691
+ patch_builder.add_input_dataset(dataset_urn)
692
+ except InvalidUrnError:
693
+ # try converting to datajob urn
694
+ datajob_urn = DataJobUrn.from_string(upstream)
695
+ patch_builder.add_input_datajob(datajob_urn)
696
+
697
+ # Process downstream connections (outputs from the datajob)
698
+ if downstreams:
699
+ for downstream in downstreams:
700
+ downstream_urn = DatasetUrn.from_string(downstream)
701
+ patch_builder.add_output_dataset(downstream_urn)
702
+
703
+ # Apply the changes to the entity
704
+ self._client.entities.update(patch_builder)
705
+
706
+ def get_lineage(
707
+ self,
708
+ *,
709
+ source_urn: Union[str, Urn],
710
+ source_column: Optional[str] = None,
711
+ direction: Literal["upstream", "downstream"] = "upstream",
712
+ max_hops: int = 1,
713
+ filter: Optional[Filter] = None,
714
+ count: int = 500,
715
+ ) -> List[LineageResult]:
716
+ """
717
+ Retrieve lineage entities connected to a source entity.
718
+ Args:
719
+ source_urn: Source URN for the lineage search
720
+ source_column: Source column for the lineage search
721
+ direction: Direction of lineage traversal
722
+ max_hops: Maximum number of hops to traverse
723
+ filter: Filters to apply to the lineage search
724
+ count: Maximum number of results to return
725
+
726
+ Returns:
727
+ List of lineage results
728
+
729
+ Raises:
730
+ SdkUsageError for invalid filter values
731
+ """
732
+ # Validate and convert input URN
733
+ source_urn = Urn.from_string(source_urn)
734
+ # Prepare GraphQL query variables with a separate method
735
+ variables = self._process_input_variables(
736
+ source_urn, source_column, filter, direction, max_hops, count
737
+ )
738
+
739
+ return self._execute_lineage_query(variables, direction)
740
+
741
    def _process_input_variables(
        self,
        source_urn: Urn,
        source_column: Optional[str] = None,
        filters: Optional[Filter] = None,
        direction: Literal["upstream", "downstream"] = "upstream",
        max_hops: int = 1,
        count: int = 500,
    ) -> Dict[str, Any]:
        """
        Process filters and prepare GraphQL query variables for lineage search.

        Args:
            source_urn: Source URN for the lineage search
            source_column: Source column for the lineage search
            filters: Optional filters to apply
            direction: Direction of lineage traversal
            max_hops: Maximum number of hops to traverse
            count: Maximum number of results to return

        Returns:
            Dictionary of GraphQL query variables

        Raises:
            SdkUsageError for invalid filter values
        """

        # print warning if max_hops is greater than 2
        if max_hops > 2:
            logger.warning(
                """If `max_hops` is more than 2, the search will try to find the full lineage graph.
                By default, only 500 results are shown.
                You can change the `count` to get more or fewer results.
                """
            )

        # Determine hop values: beyond two hops, distance is bucketed as "3+".
        max_hop_values = (
            [str(hop) for hop in range(1, max_hops + 1)]
            if max_hops <= 2
            else ["1", "2", "3+"]
        )

        # The "degree" field encodes hop distance; AND the distance filter
        # with any caller-supplied filters.
        degree_filter = FilterDsl.custom_filter(
            field="degree",
            condition="EQUAL",
            values=max_hop_values,
        )

        filters_with_max_hops = (
            FilterDsl.and_(degree_filter, filters)
            if filters is not None
            else degree_filter
        )

        types, compiled_filters = compile_filters(filters_with_max_hops)

        # Prepare base variables
        variables: Dict[str, Any] = {
            "input": {
                "urn": str(source_urn),
                "direction": direction.upper(),
                "count": count,
                "types": types,
                "orFilters": compiled_filters,
            }
        }

        # if column is provided, update the variables to include the schema field urn
        if isinstance(source_urn, SchemaFieldUrn) or source_column:
            # Group results at the schema-field level for column lineage.
            variables["input"]["searchFlags"] = {
                "groupingSpec": {
                    "groupingCriteria": {
                        "baseEntityType": "SCHEMA_FIELD",
                        "groupingEntityType": "SCHEMA_FIELD",
                    }
                }
            }
        if isinstance(source_urn, SchemaFieldUrn):
            variables["input"]["urn"] = str(source_urn)
        elif source_column:
            variables["input"]["urn"] = str(SchemaFieldUrn(source_urn, source_column))

        return variables
825
+
826
    def _execute_lineage_query(
        self,
        variables: Dict[str, Any],
        direction: Literal["upstream", "downstream"],
    ) -> List[LineageResult]:
        """Execute GraphQL query and process results.

        Pages through `scrollAcrossLineage` until the server stops returning
        a scroll id, converting every search result into a LineageResult.
        """
        # Construct GraphQL query with dynamic path query
        graphql_query = """
        query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
            scrollAcrossLineage(input: $input) {
                nextScrollId
                searchResults {
                    degree
                    entity {
                        urn
                        type
                        ... on Dataset {
                            name
                            platform {
                                name
                            }
                            properties {
                                description
                            }
                        }
                        ... on DataJob {
                            jobId
                            dataPlatformInstance {
                                platform {
                                    name
                                }
                            }
                            properties {
                                name
                                description
                            }
                        }
                    }
                    paths {
                        path {
                            urn
                            type
                        }
                    }
                }
            }
        }
        """

        results: List[LineageResult] = []

        first_iter = True
        scroll_id: Optional[str] = None

        # Scroll-based pagination: keep querying while the server hands back
        # a nextScrollId.
        while first_iter or scroll_id:
            first_iter = False

            # Update scroll ID if applicable
            if scroll_id:
                variables["input"]["scrollId"] = scroll_id

            # Execute GraphQL query
            response = self._graph.execute_graphql(graphql_query, variables=variables)
            data = response["scrollAcrossLineage"]
            scroll_id = data.get("nextScrollId")

            # Process search results
            for entry in data["searchResults"]:
                entity = entry["entity"]

                result = self._create_lineage_result(entity, entry, direction)
                results.append(result)

        return results
900
+
901
+ def _create_lineage_result(
902
+ self,
903
+ entity: Dict[str, Any],
904
+ entry: Dict[str, Any],
905
+ direction: Literal["upstream", "downstream"],
906
+ ) -> LineageResult:
907
+ """Create a LineageResult from entity and entry data."""
908
+ platform = (entity.get("platform") or {}).get("name") or (
909
+ (entity.get("dataPlatformInstance") or {}).get("platform") or {}
910
+ ).get("name")
911
+
912
+ result = LineageResult(
913
+ urn=entity["urn"],
914
+ type=entity["type"],
915
+ hops=entry["degree"],
916
+ direction=direction,
917
+ platform=platform,
918
+ )
919
+
920
+ properties = entity.get("properties", {})
921
+ if properties:
922
+ result.name = properties.get("name", "")
923
+ result.description = properties.get("description", "")
924
+
925
+ result.paths = []
926
+ if "paths" in entry:
927
+ # Process each path in the lineage graph
928
+ for path in entry["paths"]:
929
+ for path_entry in path["path"]:
930
+ # Only include schema fields in the path (exclude other types like Query)
931
+ if path_entry["type"] == "SCHEMA_FIELD":
932
+ schema_field_urn = SchemaFieldUrn.from_string(path_entry["urn"])
933
+ result.paths.append(
934
+ LineagePath(
935
+ urn=path_entry["urn"],
936
+ entity_name=DatasetUrn.from_string(
937
+ schema_field_urn.parent
938
+ ).name,
939
+ column_name=schema_field_urn.field_path,
940
+ )
941
+ )
942
+
943
+ return result