acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,38 @@ from __future__ import annotations
2
2
 
3
3
  import difflib
4
4
  import logging
5
- from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
5
+ from dataclasses import dataclass
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Any,
9
+ Callable,
10
+ Dict,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Set,
15
+ Union,
16
+ overload,
17
+ )
6
18
 
7
- from typing_extensions import assert_never
19
+ from typing_extensions import assert_never, deprecated
8
20
 
9
21
  import datahub.metadata.schema_classes as models
10
22
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
11
23
  from datahub.errors import SdkUsageError
12
- from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn
13
- from datahub.sdk._shared import DatajobUrnOrStr, DatasetUrnOrStr
24
+ from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn, SchemaFieldUrn, Urn
25
+ from datahub.sdk._shared import (
26
+ ChartUrnOrStr,
27
+ DashboardUrnOrStr,
28
+ DatajobUrnOrStr,
29
+ DatasetUrnOrStr,
30
+ )
14
31
  from datahub.sdk._utils import DEFAULT_ACTOR_URN
15
32
  from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
33
+ from datahub.sdk.search_client import compile_filters
34
+ from datahub.sdk.search_filters import Filter, FilterDsl
35
+ from datahub.specific.chart import ChartPatchBuilder
36
+ from datahub.specific.dashboard import DashboardPatchBuilder
16
37
  from datahub.specific.datajob import DataJobPatchBuilder
17
38
  from datahub.specific.dataset import DatasetPatchBuilder
18
39
  from datahub.sql_parsing.fingerprint_utils import generate_hash
@@ -32,9 +53,29 @@ _empty_audit_stamp = models.AuditStampClass(
32
53
  logger = logging.getLogger(__name__)
33
54
 
34
55
 
56
+ @dataclass
57
+ class LineagePath:
58
+ urn: str
59
+ entity_name: str
60
+ column_name: Optional[str] = None
61
+
62
+
63
+ @dataclass
64
+ class LineageResult:
65
+ urn: str
66
+ type: str
67
+ hops: int
68
+ direction: Literal["upstream", "downstream"]
69
+ platform: Optional[str] = None
70
+ name: Optional[str] = None
71
+ description: Optional[str] = None
72
+ paths: Optional[List[LineagePath]] = None
73
+
74
+
35
75
  class LineageClient:
36
76
  def __init__(self, client: DataHubClient):
37
77
  self._client = client
78
+ self._graph = client._graph
38
79
 
39
80
  def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
40
81
  schema_metadata = self._client._graph.get_aspect(
@@ -113,6 +154,378 @@ class LineageClient:
113
154
 
114
155
  return fuzzy_column_lineage
115
156
 
157
+ @overload
158
+ def add_lineage(
159
+ self,
160
+ *,
161
+ upstream: DatasetUrnOrStr,
162
+ downstream: DatasetUrnOrStr,
163
+ column_lineage: Union[
164
+ bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
165
+ ] = False,
166
+ transformation_text: Optional[str] = None,
167
+ ) -> None:
168
+ """Add dataset-to-dataset lineage with column-level mapping."""
169
+
170
+ @overload
171
+ def add_lineage(
172
+ self,
173
+ *,
174
+ upstream: Union[DatajobUrnOrStr],
175
+ downstream: DatasetUrnOrStr,
176
+ ) -> None:
177
+ """Add dataset-to-datajob or dataset-to-mlmodel lineage."""
178
+
179
+ @overload
180
+ def add_lineage(
181
+ self,
182
+ *,
183
+ upstream: Union[DatasetUrnOrStr, DatajobUrnOrStr],
184
+ downstream: DatajobUrnOrStr,
185
+ ) -> None:
186
+ """Add datajob-to-dataset or datajob-to-datajob lineage."""
187
+
188
+ @overload
189
+ def add_lineage(
190
+ self,
191
+ *,
192
+ upstream: Union[DashboardUrnOrStr, DatasetUrnOrStr, ChartUrnOrStr],
193
+ downstream: DashboardUrnOrStr,
194
+ ) -> None:
195
+ """Add dashboard-to-dashboard or dashboard-to-dataset lineage."""
196
+
197
+ @overload
198
+ def add_lineage(
199
+ self,
200
+ *,
201
+ upstream: DatasetUrnOrStr,
202
+ downstream: ChartUrnOrStr,
203
+ ) -> None:
204
+ """Add dataset-to-chart lineage."""
205
+
206
+ # The actual implementation that handles all overloaded cases
207
+ def add_lineage(
208
+ self,
209
+ *,
210
+ upstream: Union[
211
+ DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
212
+ ],
213
+ downstream: Union[
214
+ DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
215
+ ],
216
+ column_lineage: Union[
217
+ bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
218
+ ] = False,
219
+ transformation_text: Optional[str] = None,
220
+ ) -> None:
221
+ """Add lineage between two entities.
222
+
223
+ This flexible method handles different combinations of entity types:
224
+ - dataset to dataset
225
+ - dataset to datajob
226
+ - datajob to dataset
227
+ - datajob to datajob
228
+ - dashboard to dataset
229
+ - dashboard to chart
230
+ - dashboard to dashboard
231
+ - dataset to chart
232
+
233
+ Args:
234
+ upstream: URN of the upstream entity (dataset or datajob)
235
+ downstream: URN of the downstream entity (dataset or datajob)
236
+ column_lineage: Optional boolean to indicate if column-level lineage should be added or a lineage mapping type (auto_fuzzy, auto_strict, or a mapping of column-level lineage)
237
+ transformation_text: Optional SQL query text that defines the transformation
238
+ (only applicable for dataset-to-dataset lineage)
239
+
240
+ Raises:
241
+ InvalidUrnError: If the URNs provided are invalid
242
+ SdkUsageError: If certain parameter combinations are not supported
243
+ """
244
+ # Validate parameter combinations
245
+ upstream_entity_type = Urn.from_string(upstream).entity_type
246
+ downstream_entity_type = Urn.from_string(downstream).entity_type
247
+
248
+ key = (upstream_entity_type, downstream_entity_type)
249
+
250
+ # if it's not dataset-dataset lineage but provided with column_lineage or transformation_text, raise an error
251
+ if key != ("dataset", "dataset") and (column_lineage or transformation_text):
252
+ raise SdkUsageError(
253
+ "Column lineage and query text are only applicable for dataset-to-dataset lineage"
254
+ )
255
+
256
+ lineage_handlers: dict[tuple[str, str], Callable] = {
257
+ ("dataset", "dataset"): self._add_dataset_lineage,
258
+ ("dataset", "dashboard"): self._add_dashboard_lineage,
259
+ ("chart", "dashboard"): self._add_dashboard_lineage,
260
+ ("dashboard", "dashboard"): self._add_dashboard_lineage,
261
+ ("dataset", "dataJob"): self._add_datajob_lineage,
262
+ ("dataJob", "dataJob"): self._add_datajob_lineage,
263
+ ("dataJob", "dataset"): self._add_datajob_output,
264
+ ("dataset", "chart"): self._add_chart_lineage,
265
+ }
266
+
267
+ try:
268
+ lineage_handler = lineage_handlers[key]
269
+ lineage_handler(
270
+ upstream=upstream,
271
+ downstream=downstream,
272
+ upstream_type=upstream_entity_type,
273
+ column_lineage=column_lineage,
274
+ transformation_text=transformation_text,
275
+ )
276
+ except KeyError:
277
+ raise SdkUsageError(
278
+ f"Unsupported entity type combination: {upstream_entity_type} -> {downstream_entity_type}"
279
+ ) from None
280
+
281
+ def _add_dataset_lineage(
282
+ self,
283
+ *,
284
+ upstream,
285
+ downstream,
286
+ column_lineage,
287
+ transformation_text,
288
+ **_,
289
+ ):
290
+ upstream_urn = DatasetUrn.from_string(upstream)
291
+ downstream_urn = DatasetUrn.from_string(downstream)
292
+
293
+ if column_lineage:
294
+ column_lineage = (
295
+ "auto_fuzzy" if column_lineage is True else column_lineage
296
+ ) # if column_lineage is True, set it to auto_fuzzy
297
+ cll = self._process_column_lineage(
298
+ column_lineage, upstream_urn, downstream_urn
299
+ )
300
+ else:
301
+ cll = None
302
+
303
+ if transformation_text:
304
+ self._process_transformation_lineage(
305
+ transformation_text, upstream_urn, downstream_urn, cll
306
+ )
307
+ else:
308
+ updater = DatasetPatchBuilder(str(downstream_urn))
309
+ updater.add_upstream_lineage(
310
+ models.UpstreamClass(
311
+ dataset=str(upstream_urn),
312
+ type=models.DatasetLineageTypeClass.COPY,
313
+ )
314
+ )
315
+ for cl in cll or []:
316
+ updater.add_fine_grained_upstream_lineage(cl)
317
+ self._client.entities.update(updater)
318
+
319
+ def _add_dashboard_lineage(self, *, upstream, downstream, upstream_type, **_):
320
+ patch = DashboardPatchBuilder(str(downstream))
321
+ if upstream_type == "dataset":
322
+ patch.add_dataset_edge(upstream)
323
+ elif upstream_type == "chart":
324
+ patch.add_chart_edge(upstream)
325
+ elif upstream_type == "dashboard":
326
+ patch.add_dashboard(upstream)
327
+ else:
328
+ raise SdkUsageError(
329
+ f"Unsupported entity type combination: {upstream_type} -> dashboard"
330
+ )
331
+ self._client.entities.update(patch)
332
+
333
+ def _add_datajob_lineage(self, *, upstream, downstream, upstream_type, **_):
334
+ patch = DataJobPatchBuilder(str(downstream))
335
+ if upstream_type == "dataset":
336
+ patch.add_input_dataset(upstream)
337
+ elif upstream_type == "dataJob":
338
+ patch.add_input_datajob(upstream)
339
+ else:
340
+ raise SdkUsageError(
341
+ f"Unsupported entity type combination: {upstream_type} -> dataJob"
342
+ )
343
+ self._client.entities.update(patch)
344
+
345
+ def _add_datajob_output(self, *, upstream, downstream, **_):
346
+ patch = DataJobPatchBuilder(str(upstream))
347
+ patch.add_output_dataset(downstream)
348
+ self._client.entities.update(patch)
349
+
350
+ def _add_chart_lineage(self, *, upstream, downstream, **_):
351
+ patch = ChartPatchBuilder(str(downstream))
352
+ patch.add_input_edge(upstream)
353
+ self._client.entities.update(patch)
354
+
355
+ def _process_column_lineage(self, column_lineage, upstream_urn, downstream_urn):
356
+ cll = None
357
+ if column_lineage:
358
+ # Auto column lineage generation
359
+ if column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
360
+ upstream_schema = self._get_fields_from_dataset_urn(upstream_urn)
361
+ downstream_schema = self._get_fields_from_dataset_urn(downstream_urn)
362
+
363
+ # Choose matching strategy
364
+ mapping = (
365
+ self._get_fuzzy_column_lineage(upstream_schema, downstream_schema)
366
+ if column_lineage == "auto_fuzzy"
367
+ else self._get_strict_column_lineage(
368
+ upstream_schema, downstream_schema
369
+ )
370
+ )
371
+ cll = parse_cll_mapping(
372
+ upstream=upstream_urn,
373
+ downstream=downstream_urn,
374
+ cll_mapping=mapping,
375
+ )
376
+ # Explicit column lineage
377
+ elif isinstance(column_lineage, dict):
378
+ cll = parse_cll_mapping(
379
+ upstream=upstream_urn,
380
+ downstream=downstream_urn,
381
+ cll_mapping=column_lineage,
382
+ )
383
+ else:
384
+ assert_never(column_lineage)
385
+ return cll
386
+
387
+ def _process_transformation_lineage(
388
+ self, transformation_text, upstream_urn, downstream_urn, cll
389
+ ):
390
+ fields_involved = OrderedSet([str(upstream_urn), str(downstream_urn)])
391
+ if cll is not None:
392
+ for c in cll:
393
+ for field in c.upstreams or []:
394
+ fields_involved.add(field)
395
+ for field in c.downstreams or []:
396
+ fields_involved.add(field)
397
+
398
+ # Create query URN and entity
399
+ query_urn = QueryUrn(generate_hash(transformation_text)).urn()
400
+ from datahub.sql_parsing.sql_parsing_aggregator import (
401
+ make_query_subjects,
402
+ )
403
+
404
+ query_entity = MetadataChangeProposalWrapper.construct_many(
405
+ query_urn,
406
+ aspects=[
407
+ models.QueryPropertiesClass(
408
+ statement=models.QueryStatementClass(
409
+ value=transformation_text,
410
+ language=models.QueryLanguageClass.SQL,
411
+ ),
412
+ source=models.QuerySourceClass.SYSTEM,
413
+ created=_empty_audit_stamp,
414
+ lastModified=_empty_audit_stamp,
415
+ ),
416
+ make_query_subjects(list(fields_involved)),
417
+ ],
418
+ )
419
+
420
+ # Build dataset update
421
+ updater = DatasetPatchBuilder(str(downstream_urn))
422
+ updater.add_upstream_lineage(
423
+ models.UpstreamClass(
424
+ dataset=str(upstream_urn),
425
+ type=models.DatasetLineageTypeClass.TRANSFORMED,
426
+ query=query_urn,
427
+ )
428
+ )
429
+
430
+ # Add fine-grained lineage
431
+ for cl in cll or []:
432
+ cl.query = query_urn
433
+ updater.add_fine_grained_upstream_lineage(cl)
434
+
435
+ # Check dataset existence
436
+ if not self._client._graph.exists(updater.urn):
437
+ raise SdkUsageError(
438
+ f"Dataset {updater.urn} does not exist, and hence cannot be updated."
439
+ )
440
+
441
+ # Emit metadata change proposals
442
+ mcps: List[
443
+ Union[
444
+ MetadataChangeProposalWrapper,
445
+ models.MetadataChangeProposalClass,
446
+ ]
447
+ ] = list(updater.build())
448
+ if query_entity:
449
+ mcps.extend(query_entity)
450
+ self._client._graph.emit_mcps(mcps)
451
+
452
+ def infer_lineage_from_sql(
453
+ self,
454
+ *,
455
+ query_text: str,
456
+ platform: str,
457
+ platform_instance: Optional[str] = None,
458
+ env: str = "PROD",
459
+ default_db: Optional[str] = None,
460
+ default_schema: Optional[str] = None,
461
+ override_dialect: Optional[str] = None,
462
+ ) -> None:
463
+ """Add lineage by parsing a SQL query."""
464
+ from datahub.sql_parsing.sqlglot_lineage import (
465
+ create_lineage_sql_parsed_result,
466
+ )
467
+
468
+ # Parse the SQL query to extract lineage information
469
+ parsed_result = create_lineage_sql_parsed_result(
470
+ query=query_text,
471
+ default_db=default_db,
472
+ default_schema=default_schema,
473
+ platform=platform,
474
+ platform_instance=platform_instance,
475
+ env=env,
476
+ graph=self._client._graph,
477
+ override_dialect=override_dialect,
478
+ )
479
+
480
+ if parsed_result.debug_info.table_error:
481
+ raise SdkUsageError(
482
+ f"Failed to parse SQL query: {parsed_result.debug_info.error}"
483
+ )
484
+ elif parsed_result.debug_info.column_error:
485
+ logger.warning(
486
+ f"Failed to parse SQL query: {parsed_result.debug_info.error}",
487
+ )
488
+
489
+ if not parsed_result.out_tables:
490
+ raise SdkUsageError(
491
+ "No output tables found in the query. Cannot establish lineage."
492
+ )
493
+
494
+ # Use the first output table as the downstream
495
+ downstream_urn = parsed_result.out_tables[0]
496
+
497
+ # Process all upstream tables found in the query
498
+ for upstream_table in parsed_result.in_tables:
499
+ # Skip self-lineage
500
+ if upstream_table == downstream_urn:
501
+ continue
502
+
503
+ # Extract column-level lineage for this specific upstream table
504
+ column_mapping = {}
505
+ if parsed_result.column_lineage:
506
+ for col_lineage in parsed_result.column_lineage:
507
+ if not (col_lineage.downstream and col_lineage.downstream.column):
508
+ continue
509
+
510
+ # Filter upstreams to only include columns from current upstream table
511
+ upstream_cols = [
512
+ ref.column
513
+ for ref in col_lineage.upstreams
514
+ if ref.table == upstream_table and ref.column
515
+ ]
516
+
517
+ if upstream_cols:
518
+ column_mapping[col_lineage.downstream.column] = upstream_cols
519
+
520
+ # Add lineage, including query text
521
+ self.add_lineage(
522
+ upstream=upstream_table,
523
+ downstream=downstream_urn,
524
+ column_lineage=column_mapping,
525
+ transformation_text=query_text,
526
+ )
527
+
528
+ @deprecated("Use add_lineage instead")
116
529
  def add_dataset_copy_lineage(
117
530
  self,
118
531
  *,
@@ -164,13 +577,14 @@ class LineageClient:
164
577
 
165
578
  self._client.entities.update(updater)
166
579
 
580
+ @deprecated("Use add_lineage instead")
167
581
  def add_dataset_transform_lineage(
168
582
  self,
169
583
  *,
170
584
  upstream: DatasetUrnOrStr,
171
585
  downstream: DatasetUrnOrStr,
172
586
  column_lineage: Optional[ColumnLineageMapping] = None,
173
- query_text: Optional[str] = None,
587
+ transformation_text: Optional[str] = None,
174
588
  ) -> None:
175
589
  upstream = DatasetUrn.from_string(upstream)
176
590
  downstream = DatasetUrn.from_string(downstream)
@@ -193,9 +607,9 @@ class LineageClient:
193
607
 
194
608
  query_urn = None
195
609
  query_entity = None
196
- if query_text:
610
+ if transformation_text:
197
611
  # Eventually we might want to use our regex-based fingerprinting instead.
198
- fingerprint = generate_hash(query_text)
612
+ fingerprint = generate_hash(transformation_text)
199
613
  query_urn = QueryUrn(fingerprint).urn()
200
614
 
201
615
  from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
@@ -205,7 +619,8 @@ class LineageClient:
205
619
  aspects=[
206
620
  models.QueryPropertiesClass(
207
621
  statement=models.QueryStatementClass(
208
- value=query_text, language=models.QueryLanguageClass.SQL
622
+ value=transformation_text,
623
+ language=models.QueryLanguageClass.SQL,
209
624
  ),
210
625
  source=models.QuerySourceClass.SYSTEM,
211
626
  created=_empty_audit_stamp,
@@ -242,80 +657,7 @@ class LineageClient:
242
657
  mcps.extend(query_entity)
243
658
  self._client._graph.emit_mcps(mcps)
244
659
 
245
- def add_dataset_lineage_from_sql(
246
- self,
247
- *,
248
- query_text: str,
249
- platform: str,
250
- platform_instance: Optional[str] = None,
251
- env: str = "PROD",
252
- default_db: Optional[str] = None,
253
- default_schema: Optional[str] = None,
254
- ) -> None:
255
- """Add lineage by parsing a SQL query."""
256
- from datahub.sql_parsing.sqlglot_lineage import (
257
- create_lineage_sql_parsed_result,
258
- )
259
-
260
- # Parse the SQL query to extract lineage information
261
- parsed_result = create_lineage_sql_parsed_result(
262
- query=query_text,
263
- default_db=default_db,
264
- default_schema=default_schema,
265
- platform=platform,
266
- platform_instance=platform_instance,
267
- env=env,
268
- graph=self._client._graph,
269
- )
270
-
271
- if parsed_result.debug_info.table_error:
272
- raise SdkUsageError(
273
- f"Failed to parse SQL query: {parsed_result.debug_info.error}"
274
- )
275
- elif parsed_result.debug_info.column_error:
276
- logger.warning(
277
- f"Failed to parse SQL query: {parsed_result.debug_info.error}",
278
- )
279
-
280
- if not parsed_result.out_tables:
281
- raise SdkUsageError(
282
- "No output tables found in the query. Cannot establish lineage."
283
- )
284
-
285
- # Use the first output table as the downstream
286
- downstream_urn = parsed_result.out_tables[0]
287
-
288
- # Process all upstream tables found in the query
289
- for upstream_table in parsed_result.in_tables:
290
- # Skip self-lineage
291
- if upstream_table == downstream_urn:
292
- continue
293
-
294
- # Extract column-level lineage for this specific upstream table
295
- column_mapping = {}
296
- if parsed_result.column_lineage:
297
- for col_lineage in parsed_result.column_lineage:
298
- if not (col_lineage.downstream and col_lineage.downstream.column):
299
- continue
300
-
301
- # Filter upstreams to only include columns from current upstream table
302
- upstream_cols = [
303
- ref.column
304
- for ref in col_lineage.upstreams
305
- if ref.table == upstream_table and ref.column
306
- ]
307
-
308
- if upstream_cols:
309
- column_mapping[col_lineage.downstream.column] = upstream_cols
310
-
311
- # Add lineage, including query text
312
- self.add_dataset_transform_lineage(
313
- upstream=upstream_table,
314
- downstream=downstream_urn,
315
- column_lineage=column_mapping or None,
316
- query_text=query_text,
317
- )
318
-
660
+ @deprecated("Use add_lineage instead")
319
661
  def add_datajob_lineage(
320
662
  self,
321
663
  *,
@@ -360,3 +702,242 @@ class LineageClient:
360
702
 
361
703
  # Apply the changes to the entity
362
704
  self._client.entities.update(patch_builder)
705
+
706
+ def get_lineage(
707
+ self,
708
+ *,
709
+ source_urn: Union[str, Urn],
710
+ source_column: Optional[str] = None,
711
+ direction: Literal["upstream", "downstream"] = "upstream",
712
+ max_hops: int = 1,
713
+ filter: Optional[Filter] = None,
714
+ count: int = 500,
715
+ ) -> List[LineageResult]:
716
+ """
717
+ Retrieve lineage entities connected to a source entity.
718
+ Args:
719
+ source_urn: Source URN for the lineage search
720
+ source_column: Source column for the lineage search
721
+ direction: Direction of lineage traversal
722
+ max_hops: Maximum number of hops to traverse
723
+ filter: Filters to apply to the lineage search
724
+ count: Maximum number of results to return
725
+
726
+ Returns:
727
+ List of lineage results
728
+
729
+ Raises:
730
+ SdkUsageError for invalid filter values
731
+ """
732
+ # Validate and convert input URN
733
+ source_urn = Urn.from_string(source_urn)
734
+ # Prepare GraphQL query variables with a separate method
735
+ variables = self._process_input_variables(
736
+ source_urn, source_column, filter, direction, max_hops, count
737
+ )
738
+
739
+ return self._execute_lineage_query(variables, direction)
740
+
741
+ def _process_input_variables(
742
+ self,
743
+ source_urn: Urn,
744
+ source_column: Optional[str] = None,
745
+ filters: Optional[Filter] = None,
746
+ direction: Literal["upstream", "downstream"] = "upstream",
747
+ max_hops: int = 1,
748
+ count: int = 500,
749
+ ) -> Dict[str, Any]:
750
+ """
751
+ Process filters and prepare GraphQL query variables for lineage search.
752
+
753
+ Args:
754
+ source_urn: Source URN for the lineage search
755
+ source_column: Source column for the lineage search
756
+ filters: Optional filters to apply
757
+ direction: Direction of lineage traversal
758
+ max_hops: Maximum number of hops to traverse
759
+ count: Maximum number of results to return
760
+
761
+ Returns:
762
+ Dictionary of GraphQL query variables
763
+
764
+ Raises:
765
+ SdkUsageError for invalid filter values
766
+ """
767
+
768
+ # print warning if max_hops is greater than 2
769
+ if max_hops > 2:
770
+ logger.warning(
771
+ """If `max_hops` is more than 2, the search will try to find the full lineage graph.
772
+ By default, only 500 results are shown.
773
+ You can change the `count` to get more or fewer results.
774
+ """
775
+ )
776
+
777
+ # Determine hop values
778
+ max_hop_values = (
779
+ [str(hop) for hop in range(1, max_hops + 1)]
780
+ if max_hops <= 2
781
+ else ["1", "2", "3+"]
782
+ )
783
+
784
+ degree_filter = FilterDsl.custom_filter(
785
+ field="degree",
786
+ condition="EQUAL",
787
+ values=max_hop_values,
788
+ )
789
+
790
+ filters_with_max_hops = (
791
+ FilterDsl.and_(degree_filter, filters)
792
+ if filters is not None
793
+ else degree_filter
794
+ )
795
+
796
+ types, compiled_filters = compile_filters(filters_with_max_hops)
797
+
798
+ # Prepare base variables
799
+ variables: Dict[str, Any] = {
800
+ "input": {
801
+ "urn": str(source_urn),
802
+ "direction": direction.upper(),
803
+ "count": count,
804
+ "types": types,
805
+ "orFilters": compiled_filters,
806
+ }
807
+ }
808
+
809
+ # if column is provided, update the variables to include the schema field urn
810
+ if isinstance(source_urn, SchemaFieldUrn) or source_column:
811
+ variables["input"]["searchFlags"] = {
812
+ "groupingSpec": {
813
+ "groupingCriteria": {
814
+ "baseEntityType": "SCHEMA_FIELD",
815
+ "groupingEntityType": "SCHEMA_FIELD",
816
+ }
817
+ }
818
+ }
819
+ if isinstance(source_urn, SchemaFieldUrn):
820
+ variables["input"]["urn"] = str(source_urn)
821
+ elif source_column:
822
+ variables["input"]["urn"] = str(SchemaFieldUrn(source_urn, source_column))
823
+
824
+ return variables
825
+
826
+ def _execute_lineage_query(
827
+ self,
828
+ variables: Dict[str, Any],
829
+ direction: Literal["upstream", "downstream"],
830
+ ) -> List[LineageResult]:
831
+ """Execute GraphQL query and process results."""
832
+ # Construct GraphQL query with dynamic path query
833
+ graphql_query = """
834
+ query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
835
+ scrollAcrossLineage(input: $input) {
836
+ nextScrollId
837
+ searchResults {
838
+ degree
839
+ entity {
840
+ urn
841
+ type
842
+ ... on Dataset {
843
+ name
844
+ platform {
845
+ name
846
+ }
847
+ properties {
848
+ description
849
+ }
850
+ }
851
+ ... on DataJob {
852
+ jobId
853
+ dataPlatformInstance {
854
+ platform {
855
+ name
856
+ }
857
+ }
858
+ properties {
859
+ name
860
+ description
861
+ }
862
+ }
863
+ }
864
+ paths {
865
+ path {
866
+ urn
867
+ type
868
+ }
869
+ }
870
+ }
871
+ }
872
+ }
873
+ """
874
+
875
+ results: List[LineageResult] = []
876
+
877
+ first_iter = True
878
+ scroll_id: Optional[str] = None
879
+
880
+ while first_iter or scroll_id:
881
+ first_iter = False
882
+
883
+ # Update scroll ID if applicable
884
+ if scroll_id:
885
+ variables["input"]["scrollId"] = scroll_id
886
+
887
+ # Execute GraphQL query
888
+ response = self._graph.execute_graphql(graphql_query, variables=variables)
889
+ data = response["scrollAcrossLineage"]
890
+ scroll_id = data.get("nextScrollId")
891
+
892
+ # Process search results
893
+ for entry in data["searchResults"]:
894
+ entity = entry["entity"]
895
+
896
+ result = self._create_lineage_result(entity, entry, direction)
897
+ results.append(result)
898
+
899
+ return results
900
+
901
+ def _create_lineage_result(
902
+ self,
903
+ entity: Dict[str, Any],
904
+ entry: Dict[str, Any],
905
+ direction: Literal["upstream", "downstream"],
906
+ ) -> LineageResult:
907
+ """Create a LineageResult from entity and entry data."""
908
+ platform = (entity.get("platform") or {}).get("name") or (
909
+ (entity.get("dataPlatformInstance") or {}).get("platform") or {}
910
+ ).get("name")
911
+
912
+ result = LineageResult(
913
+ urn=entity["urn"],
914
+ type=entity["type"],
915
+ hops=entry["degree"],
916
+ direction=direction,
917
+ platform=platform,
918
+ )
919
+
920
+ properties = entity.get("properties", {})
921
+ if properties:
922
+ result.name = properties.get("name", "")
923
+ result.description = properties.get("description", "")
924
+
925
+ result.paths = []
926
+ if "paths" in entry:
927
+ # Process each path in the lineage graph
928
+ for path in entry["paths"]:
929
+ for path_entry in path["path"]:
930
+ # Only include schema fields in the path (exclude other types like Query)
931
+ if path_entry["type"] == "SCHEMA_FIELD":
932
+ schema_field_urn = SchemaFieldUrn.from_string(path_entry["urn"])
933
+ result.paths.append(
934
+ LineagePath(
935
+ urn=path_entry["urn"],
936
+ entity_name=DatasetUrn.from_string(
937
+ schema_field_urn.parent
938
+ ).name,
939
+ column_name=schema_field_urn.field_path,
940
+ )
941
+ )
942
+
943
+ return result