acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (221):
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/emitter/rest_emitter.py +70 -12
  36. datahub/entrypoints.py +4 -3
  37. datahub/ingestion/api/decorators.py +15 -3
  38. datahub/ingestion/api/report.py +332 -3
  39. datahub/ingestion/api/sink.py +3 -0
  40. datahub/ingestion/api/source.py +48 -44
  41. datahub/ingestion/autogenerated/__init__.py +0 -0
  42. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  43. datahub/ingestion/autogenerated/lineage.json +401 -0
  44. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  45. datahub/ingestion/extractor/schema_util.py +13 -4
  46. datahub/ingestion/glossary/classification_mixin.py +5 -0
  47. datahub/ingestion/graph/client.py +100 -15
  48. datahub/ingestion/graph/config.py +1 -0
  49. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  50. datahub/ingestion/run/pipeline.py +54 -2
  51. datahub/ingestion/sink/datahub_rest.py +13 -0
  52. datahub/ingestion/source/abs/source.py +1 -1
  53. datahub/ingestion/source/aws/aws_common.py +4 -0
  54. datahub/ingestion/source/aws/glue.py +489 -244
  55. datahub/ingestion/source/aws/tag_entities.py +292 -0
  56. datahub/ingestion/source/azure/azure_common.py +2 -2
  57. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  58. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  59. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  60. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  61. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  62. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  63. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  64. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  65. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  66. datahub/ingestion/source/common/subtypes.py +45 -0
  67. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  68. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  69. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  70. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  71. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  72. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  73. datahub/ingestion/source/debug/__init__.py +0 -0
  74. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  75. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  76. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  77. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  78. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  79. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  80. datahub/ingestion/source/file.py +3 -0
  81. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  82. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  83. datahub/ingestion/source/ge_data_profiler.py +76 -28
  84. datahub/ingestion/source/ge_profiling_config.py +11 -0
  85. datahub/ingestion/source/hex/api.py +26 -1
  86. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +1 -1
  88. datahub/ingestion/source/identity/okta.py +1 -14
  89. datahub/ingestion/source/kafka/kafka.py +16 -0
  90. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  91. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  92. datahub/ingestion/source/looker/looker_source.py +1 -0
  93. datahub/ingestion/source/mlflow.py +11 -1
  94. datahub/ingestion/source/mock_data/__init__.py +0 -0
  95. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  97. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  98. datahub/ingestion/source/nifi.py +1 -1
  99. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  100. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  101. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  102. datahub/ingestion/source/preset.py +2 -2
  103. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  104. datahub/ingestion/source/redshift/redshift.py +21 -1
  105. datahub/ingestion/source/redshift/usage.py +4 -3
  106. datahub/ingestion/source/s3/report.py +4 -2
  107. datahub/ingestion/source/s3/source.py +367 -115
  108. datahub/ingestion/source/sac/sac.py +3 -1
  109. datahub/ingestion/source/salesforce.py +6 -3
  110. datahub/ingestion/source/sigma/sigma.py +7 -1
  111. datahub/ingestion/source/slack/slack.py +2 -1
  112. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  113. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  114. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  115. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  116. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  117. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  118. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  119. datahub/ingestion/source/sql/athena.py +119 -11
  120. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  121. datahub/ingestion/source/sql/clickhouse.py +3 -1
  122. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  123. datahub/ingestion/source/sql/hana.py +3 -1
  124. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  125. datahub/ingestion/source/sql/mariadb.py +0 -1
  126. datahub/ingestion/source/sql/mssql/source.py +239 -34
  127. datahub/ingestion/source/sql/mysql.py +0 -1
  128. datahub/ingestion/source/sql/oracle.py +1 -1
  129. datahub/ingestion/source/sql/postgres.py +0 -1
  130. datahub/ingestion/source/sql/sql_common.py +121 -34
  131. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  132. datahub/ingestion/source/sql/teradata.py +997 -235
  133. datahub/ingestion/source/sql/vertica.py +10 -6
  134. datahub/ingestion/source/sql_queries.py +2 -2
  135. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  136. datahub/ingestion/source/superset.py +58 -3
  137. datahub/ingestion/source/tableau/tableau.py +58 -37
  138. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  139. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  140. datahub/ingestion/source/unity/config.py +5 -0
  141. datahub/ingestion/source/unity/proxy.py +118 -0
  142. datahub/ingestion/source/unity/source.py +195 -17
  143. datahub/ingestion/source/unity/tag_entities.py +295 -0
  144. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  145. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  146. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  147. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  148. datahub/metadata/_internal_schema_classes.py +1433 -546
  149. datahub/metadata/_urns/urn_defs.py +1826 -1658
  150. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  151. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  157. datahub/metadata/schema.avsc +17736 -17112
  158. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  159. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  160. datahub/metadata/schemas/Applications.avsc +38 -0
  161. datahub/metadata/schemas/ChartKey.avsc +1 -0
  162. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  164. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  165. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  166. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  167. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  168. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  169. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  170. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  171. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  172. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  173. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  176. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  177. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  178. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  179. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  180. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  181. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  182. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  183. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  184. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  185. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  186. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  187. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  188. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  189. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  190. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  191. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  192. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  193. datahub/metadata/schemas/__init__.py +3 -3
  194. datahub/sdk/__init__.py +2 -0
  195. datahub/sdk/_all_entities.py +7 -0
  196. datahub/sdk/_shared.py +116 -0
  197. datahub/sdk/chart.py +315 -0
  198. datahub/sdk/container.py +7 -0
  199. datahub/sdk/dashboard.py +432 -0
  200. datahub/sdk/dataflow.py +7 -0
  201. datahub/sdk/datajob.py +45 -13
  202. datahub/sdk/dataset.py +8 -2
  203. datahub/sdk/entity_client.py +82 -2
  204. datahub/sdk/lineage_client.py +683 -82
  205. datahub/sdk/main_client.py +46 -16
  206. datahub/sdk/mlmodel.py +101 -38
  207. datahub/sdk/mlmodelgroup.py +7 -0
  208. datahub/sdk/search_client.py +4 -3
  209. datahub/specific/chart.py +1 -1
  210. datahub/specific/dataproduct.py +4 -0
  211. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  212. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  213. datahub/telemetry/telemetry.py +17 -11
  214. datahub/testing/sdk_v2_helpers.py +7 -1
  215. datahub/upgrade/upgrade.py +46 -13
  216. datahub/utilities/server_config_util.py +8 -0
  217. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  218. datahub/utilities/stats_collections.py +4 -0
  219. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
  220. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/sdk/lineage_client.py
@@ -2,17 +2,38 @@ from __future__ import annotations
 
 import difflib
 import logging
-from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
+from dataclasses import dataclass
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Union,
+    overload,
+)
 
-from typing_extensions import assert_never
+from typing_extensions import assert_never, deprecated
 
 import datahub.metadata.schema_classes as models
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.errors import SdkUsageError
-from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn
-from datahub.sdk._shared import DatajobUrnOrStr, DatasetUrnOrStr
+from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn, SchemaFieldUrn, Urn
+from datahub.sdk._shared import (
+    ChartUrnOrStr,
+    DashboardUrnOrStr,
+    DatajobUrnOrStr,
+    DatasetUrnOrStr,
+)
 from datahub.sdk._utils import DEFAULT_ACTOR_URN
 from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+from datahub.sdk.search_client import compile_filters
+from datahub.sdk.search_filters import Filter, FilterDsl
+from datahub.specific.chart import ChartPatchBuilder
+from datahub.specific.dashboard import DashboardPatchBuilder
 from datahub.specific.datajob import DataJobPatchBuilder
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.fingerprint_utils import generate_hash
@@ -32,9 +53,29 @@ _empty_audit_stamp = models.AuditStampClass(
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class LineagePath:
+    urn: str
+    entity_name: str
+    column_name: Optional[str] = None
+
+
+@dataclass
+class LineageResult:
+    urn: str
+    type: str
+    hops: int
+    direction: Literal["upstream", "downstream"]
+    platform: Optional[str] = None
+    name: Optional[str] = None
+    description: Optional[str] = None
+    paths: Optional[List[LineagePath]] = None
+
+
 class LineageClient:
     def __init__(self, client: DataHubClient):
         self._client = client
+        self._graph = client._graph
 
     def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
         schema_metadata = self._client._graph.get_aspect(
@@ -113,6 +154,398 @@ class LineageClient:
 
         return fuzzy_column_lineage
 
+    @overload
+    def add_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: DatasetUrnOrStr,
+        column_lineage: Union[
+            bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+        ] = False,
+        transformation_text: Optional[str] = None,
+    ) -> None:
+        ...
+
+    """
+    Add dataset-to-dataset lineage with column-level mapping.
+    """
+
+    @overload
+    def add_lineage(
+        self,
+        *,
+        upstream: Union[DatajobUrnOrStr],
+        downstream: DatasetUrnOrStr,
+    ) -> None:
+        ...
+
+    """
+    Add dataset-to-datajob or dataset-to-mlmodel lineage.
+    """
+
+    @overload
+    def add_lineage(
+        self,
+        *,
+        upstream: Union[DatasetUrnOrStr, DatajobUrnOrStr],
+        downstream: DatajobUrnOrStr,
+    ) -> None:
+        ...
+
+    """
+    Add datajob-to-dataset or datajob-to-datajob lineage.
+    """
+
+    @overload
+    def add_lineage(
+        self,
+        *,
+        upstream: Union[DashboardUrnOrStr, DatasetUrnOrStr, ChartUrnOrStr],
+        downstream: DashboardUrnOrStr,
+    ) -> None:
+        ...
+
+    """
+    Add dashboard-to-dashboard or dashboard-to-dataset lineage.
+    """
+
+    @overload
+    def add_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: ChartUrnOrStr,
+    ) -> None:
+        ...
+    """
+    Add dataset-to-chart lineage.
+    """
+
+    # The actual implementation that handles all overloaded cases
+    def add_lineage(
+        self,
+        *,
+        upstream: Union[
+            DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
+        ],
+        downstream: Union[
+            DatasetUrnOrStr, DatajobUrnOrStr, DashboardUrnOrStr, ChartUrnOrStr
+        ],
+        column_lineage: Union[
+            bool, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+        ] = False,
+        transformation_text: Optional[str] = None,
+    ) -> None:
+        """
+        Add lineage between two entities.
+
+        This flexible method handles different combinations of entity types:
+        - dataset to dataset
+        - dataset to datajob
+        - datajob to dataset
+        - datajob to datajob
+        - dashboard to dataset
+        - dashboard to chart
+        - dashboard to dashboard
+        - dataset to chart
+
+        Args:
+            upstream: URN of the upstream entity (dataset or datajob)
+            downstream: URN of the downstream entity (dataset or datajob)
+            column_lineage: Optional boolean to indicate if column-level lineage should be added or a lineage mapping type (auto_fuzzy, auto_strict, or a mapping of column-level lineage)
+            transformation_text: Optional SQL query text that defines the transformation
+                (only applicable for dataset-to-dataset lineage)
+
+        Raises:
+            InvalidUrnError: If the URNs provided are invalid
+            SdkUsageError: If certain parameter combinations are not supported
+        """
+        # Validate parameter combinations
+        upstream_entity_type = Urn.from_string(upstream).entity_type
+        downstream_entity_type = Urn.from_string(downstream).entity_type
+
+        key = (upstream_entity_type, downstream_entity_type)
+
+        # if it's not dataset-dataset lineage but provided with column_lineage or transformation_text, raise an error
+        if key != ("dataset", "dataset") and (column_lineage or transformation_text):
+            raise SdkUsageError(
+                "Column lineage and query text are only applicable for dataset-to-dataset lineage"
+            )
+
+        lineage_handlers: dict[tuple[str, str], Callable] = {
+            ("dataset", "dataset"): self._add_dataset_lineage,
+            ("dataset", "dashboard"): self._add_dashboard_lineage,
+            ("chart", "dashboard"): self._add_dashboard_lineage,
+            ("dashboard", "dashboard"): self._add_dashboard_lineage,
+            ("dataset", "dataJob"): self._add_datajob_lineage,
+            ("dataJob", "dataJob"): self._add_datajob_lineage,
+            ("dataJob", "dataset"): self._add_datajob_output,
+            ("dataset", "chart"): self._add_chart_lineage,
+        }
+
+        try:
+            lineage_handler = lineage_handlers[key]
+            lineage_handler(
+                upstream=upstream,
+                downstream=downstream,
+                upstream_type=upstream_entity_type,
+                column_lineage=column_lineage,
+                transformation_text=transformation_text,
+            )
+        except KeyError:
+            raise SdkUsageError(
+                f"Unsupported entity type combination: {upstream_entity_type} -> {downstream_entity_type}"
+            ) from None
+
+    def _add_dataset_lineage(
+        self,
+        *,
+        upstream,
+        downstream,
+        column_lineage,
+        transformation_text,
+        **_,
+    ):
+        upstream_urn = DatasetUrn.from_string(upstream)
+        downstream_urn = DatasetUrn.from_string(downstream)
+
+        if column_lineage:
+            column_lineage = (
+                "auto_fuzzy" if column_lineage is True else column_lineage
+            )  # if column_lineage is True, set it to auto_fuzzy
+            cll = self._process_column_lineage(
+                column_lineage, upstream_urn, downstream_urn
+            )
+        else:
+            cll = None
+
+        if transformation_text:
+            self._process_transformation_lineage(
+                transformation_text, upstream_urn, downstream_urn, cll
+            )
+        else:
+            updater = DatasetPatchBuilder(str(downstream_urn))
+            updater.add_upstream_lineage(
+                models.UpstreamClass(
+                    dataset=str(upstream_urn),
+                    type=models.DatasetLineageTypeClass.COPY,
+                )
+            )
+            for cl in cll or []:
+                updater.add_fine_grained_upstream_lineage(cl)
+            self._client.entities.update(updater)
+
+    def _add_dashboard_lineage(self, *, upstream, downstream, upstream_type, **_):
+        patch = DashboardPatchBuilder(str(downstream))
+        if upstream_type == "dataset":
+            patch.add_dataset_edge(upstream)
+        elif upstream_type == "chart":
+            patch.add_chart_edge(upstream)
+        elif upstream_type == "dashboard":
+            patch.add_dashboard(upstream)
+        else:
+            raise SdkUsageError(
+                f"Unsupported entity type combination: {upstream_type} -> dashboard"
+            )
+        self._client.entities.update(patch)
+
+    def _add_datajob_lineage(self, *, upstream, downstream, upstream_type, **_):
+        patch = DataJobPatchBuilder(str(downstream))
+        if upstream_type == "dataset":
+            patch.add_input_dataset(upstream)
+        elif upstream_type == "dataJob":
+            patch.add_input_datajob(upstream)
+        else:
+            raise SdkUsageError(
+                f"Unsupported entity type combination: {upstream_type} -> dataJob"
+            )
+        self._client.entities.update(patch)
+
+    def _add_datajob_output(self, *, upstream, downstream, **_):
+        patch = DataJobPatchBuilder(str(upstream))
+        patch.add_output_dataset(downstream)
+        self._client.entities.update(patch)
+
+    def _add_chart_lineage(self, *, upstream, downstream, **_):
+        patch = ChartPatchBuilder(str(downstream))
+        patch.add_input_edge(upstream)
+        self._client.entities.update(patch)
+
+    def _process_column_lineage(self, column_lineage, upstream_urn, downstream_urn):
+        cll = None
+        if column_lineage:
+            # Auto column lineage generation
+            if column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
+                upstream_schema = self._get_fields_from_dataset_urn(upstream_urn)
+                downstream_schema = self._get_fields_from_dataset_urn(downstream_urn)
+
+                # Choose matching strategy
+                mapping = (
+                    self._get_fuzzy_column_lineage(upstream_schema, downstream_schema)
+                    if column_lineage == "auto_fuzzy"
+                    else self._get_strict_column_lineage(
+                        upstream_schema, downstream_schema
+                    )
+                )
+                cll = parse_cll_mapping(
+                    upstream=upstream_urn,
+                    downstream=downstream_urn,
+                    cll_mapping=mapping,
+                )
+            # Explicit column lineage
+            elif isinstance(column_lineage, dict):
+                cll = parse_cll_mapping(
+                    upstream=upstream_urn,
+                    downstream=downstream_urn,
+                    cll_mapping=column_lineage,
+                )
+            else:
+                assert_never(column_lineage)
+        return cll
+
+    def _process_transformation_lineage(
+        self, transformation_text, upstream_urn, downstream_urn, cll
+    ):
+        fields_involved = OrderedSet([str(upstream_urn), str(downstream_urn)])
+        if cll is not None:
+            for c in cll:
+                for field in c.upstreams or []:
+                    fields_involved.add(field)
+                for field in c.downstreams or []:
+                    fields_involved.add(field)
+
+        # Create query URN and entity
+        query_urn = QueryUrn(generate_hash(transformation_text)).urn()
+        from datahub.sql_parsing.sql_parsing_aggregator import (
+            make_query_subjects,
+        )
+
+        query_entity = MetadataChangeProposalWrapper.construct_many(
+            query_urn,
+            aspects=[
+                models.QueryPropertiesClass(
+                    statement=models.QueryStatementClass(
+                        value=transformation_text,
+                        language=models.QueryLanguageClass.SQL,
+                    ),
+                    source=models.QuerySourceClass.SYSTEM,
+                    created=_empty_audit_stamp,
+                    lastModified=_empty_audit_stamp,
+                ),
+                make_query_subjects(list(fields_involved)),
+            ],
+        )
+
+        # Build dataset update
+        updater = DatasetPatchBuilder(str(downstream_urn))
+        updater.add_upstream_lineage(
+            models.UpstreamClass(
+                dataset=str(upstream_urn),
+                type=models.DatasetLineageTypeClass.TRANSFORMED,
+                query=query_urn,
+            )
+        )
+
+        # Add fine-grained lineage
+        for cl in cll or []:
+            cl.query = query_urn
+            updater.add_fine_grained_upstream_lineage(cl)
+
+        # Check dataset existence
+        if not self._client._graph.exists(updater.urn):
+            raise SdkUsageError(
+                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
+            )
+
+        # Emit metadata change proposals
+        mcps: List[
+            Union[
+                MetadataChangeProposalWrapper,
+                models.MetadataChangeProposalClass,
+            ]
+        ] = list(updater.build())
+        if query_entity:
+            mcps.extend(query_entity)
+        self._client._graph.emit_mcps(mcps)
+
+    def infer_lineage_from_sql(
+        self,
+        *,
+        query_text: str,
+        platform: str,
+        platform_instance: Optional[str] = None,
+        env: str = "PROD",
+        default_db: Optional[str] = None,
+        default_schema: Optional[str] = None,
+        override_dialect: Optional[str] = None,
+    ) -> None:
+        """Add lineage by parsing a SQL query."""
+        from datahub.sql_parsing.sqlglot_lineage import (
+            create_lineage_sql_parsed_result,
+        )
+
+        # Parse the SQL query to extract lineage information
+        parsed_result = create_lineage_sql_parsed_result(
+            query=query_text,
+            default_db=default_db,
+            default_schema=default_schema,
+            platform=platform,
+            platform_instance=platform_instance,
+            env=env,
+            graph=self._client._graph,
+            override_dialect=override_dialect,
+        )
+
+        if parsed_result.debug_info.table_error:
+            raise SdkUsageError(
+                f"Failed to parse SQL query: {parsed_result.debug_info.error}"
+            )
+        elif parsed_result.debug_info.column_error:
+            logger.warning(
+                f"Failed to parse SQL query: {parsed_result.debug_info.error}",
+            )
+
+        if not parsed_result.out_tables:
+            raise SdkUsageError(
+                "No output tables found in the query. Cannot establish lineage."
+            )
+
+        # Use the first output table as the downstream
+        downstream_urn = parsed_result.out_tables[0]
+
+        # Process all upstream tables found in the query
+        for upstream_table in parsed_result.in_tables:
+            # Skip self-lineage
+            if upstream_table == downstream_urn:
+                continue
+
+            # Extract column-level lineage for this specific upstream table
+            column_mapping = {}
+            if parsed_result.column_lineage:
+                for col_lineage in parsed_result.column_lineage:
+                    if not (col_lineage.downstream and col_lineage.downstream.column):
+                        continue
+
+                    # Filter upstreams to only include columns from current upstream table
+                    upstream_cols = [
+                        ref.column
+                        for ref in col_lineage.upstreams
+                        if ref.table == upstream_table and ref.column
+                    ]
+
+                    if upstream_cols:
+                        column_mapping[col_lineage.downstream.column] = upstream_cols
+
+            # Add lineage, including query text
+            self.add_lineage(
+                upstream=upstream_table,
+                downstream=downstream_urn,
+                column_lineage=column_mapping,
+                transformation_text=query_text,
+            )
+
+    @deprecated("Use add_lineage instead")
     def add_dataset_copy_lineage(
         self,
         *,
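
Editorial note (not part of the diff): the hunk above introduces the unified add_lineage entry point, which dispatches on the URN types of both endpoints. A minimal usage sketch follows; the server URL, token, and URNs are illustrative placeholders, and it assumes a reachable DataHub instance.

# Usage sketch (editorial). Server URL, token, and URNs are placeholders.
from datahub.sdk import DataHubClient

client = DataHubClient(server="http://localhost:8080", token="<token>")

# Dataset-to-dataset lineage with an explicit column mapping
# (downstream column -> list of upstream columns) and the defining SQL.
client.lineage.add_lineage(
    upstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,raw.orders,PROD)",
    downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.orders,PROD)",
    column_lineage={"order_id": ["id"]},
    transformation_text="CREATE TABLE analytics.orders AS SELECT id AS order_id FROM raw.orders",
)

# Dataset feeding a chart. Passing column_lineage or transformation_text here
# would raise SdkUsageError, since those only apply to dataset-to-dataset lineage.
client.lineage.add_lineage(
    upstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.orders,PROD)",
    downstream="urn:li:chart:(looker,orders_by_region)",
)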
@@ -164,13 +597,14 @@ class LineageClient:
 
         self._client.entities.update(updater)
 
+    @deprecated("Use add_lineage instead")
     def add_dataset_transform_lineage(
         self,
         *,
         upstream: DatasetUrnOrStr,
         downstream: DatasetUrnOrStr,
         column_lineage: Optional[ColumnLineageMapping] = None,
-        query_text: Optional[str] = None,
+        transformation_text: Optional[str] = None,
     ) -> None:
         upstream = DatasetUrn.from_string(upstream)
         downstream = DatasetUrn.from_string(downstream)
@@ -193,9 +627,9 @@ class LineageClient:
 
         query_urn = None
         query_entity = None
-        if query_text:
+        if transformation_text:
             # Eventually we might want to use our regex-based fingerprinting instead.
-            fingerprint = generate_hash(query_text)
+            fingerprint = generate_hash(transformation_text)
             query_urn = QueryUrn(fingerprint).urn()
 
             from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
@@ -205,7 +639,8 @@ class LineageClient:
             aspects=[
                 models.QueryPropertiesClass(
                     statement=models.QueryStatementClass(
-                        value=query_text, language=models.QueryLanguageClass.SQL
+                        value=transformation_text,
+                        language=models.QueryLanguageClass.SQL,
                     ),
                     source=models.QuerySourceClass.SYSTEM,
                     created=_empty_audit_stamp,
@@ -242,80 +677,7 @@ class LineageClient:
             mcps.extend(query_entity)
         self._client._graph.emit_mcps(mcps)
 
-    def add_dataset_lineage_from_sql(
-        self,
-        *,
-        query_text: str,
-        platform: str,
-        platform_instance: Optional[str] = None,
-        env: str = "PROD",
-        default_db: Optional[str] = None,
-        default_schema: Optional[str] = None,
-    ) -> None:
-        """Add lineage by parsing a SQL query."""
-        from datahub.sql_parsing.sqlglot_lineage import (
-            create_lineage_sql_parsed_result,
-        )
-
-        # Parse the SQL query to extract lineage information
-        parsed_result = create_lineage_sql_parsed_result(
-            query=query_text,
-            default_db=default_db,
-            default_schema=default_schema,
-            platform=platform,
-            platform_instance=platform_instance,
-            env=env,
-            graph=self._client._graph,
-        )
-
-        if parsed_result.debug_info.table_error:
-            raise SdkUsageError(
-                f"Failed to parse SQL query: {parsed_result.debug_info.error}"
-            )
-        elif parsed_result.debug_info.column_error:
-            logger.warning(
-                f"Failed to parse SQL query: {parsed_result.debug_info.error}",
-            )
-
-        if not parsed_result.out_tables:
-            raise SdkUsageError(
-                "No output tables found in the query. Cannot establish lineage."
-            )
-
-        # Use the first output table as the downstream
-        downstream_urn = parsed_result.out_tables[0]
-
-        # Process all upstream tables found in the query
-        for upstream_table in parsed_result.in_tables:
-            # Skip self-lineage
-            if upstream_table == downstream_urn:
-                continue
-
-            # Extract column-level lineage for this specific upstream table
-            column_mapping = {}
-            if parsed_result.column_lineage:
-                for col_lineage in parsed_result.column_lineage:
-                    if not (col_lineage.downstream and col_lineage.downstream.column):
-                        continue
-
-                    # Filter upstreams to only include columns from current upstream table
-                    upstream_cols = [
-                        ref.column
-                        for ref in col_lineage.upstreams
-                        if ref.table == upstream_table and ref.column
-                    ]
-
-                    if upstream_cols:
-                        column_mapping[col_lineage.downstream.column] = upstream_cols
-
-            # Add lineage, including query text
-            self.add_dataset_transform_lineage(
-                upstream=upstream_table,
-                downstream=downstream_urn,
-                column_lineage=column_mapping or None,
-                query_text=query_text,
-            )
-
+    @deprecated("Use add_lineage instead")
     def add_datajob_lineage(
         self,
         *,
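
Editorial note (not part of the diff): the hunk above removes add_dataset_lineage_from_sql; its behavior now lives in infer_lineage_from_sql from the earlier hunk, which additionally accepts override_dialect. A migration sketch follows, reusing `client` from the earlier note; database and schema names are illustrative.

# Migration sketch (editorial); names are illustrative.
client.lineage.infer_lineage_from_sql(
    query_text="INSERT INTO sales.summary SELECT region, SUM(amount) FROM sales.orders GROUP BY region",
    platform="snowflake",
    default_db="SALES_DB",
    default_schema="PUBLIC",
    env="PROD",
)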
@@ -360,3 +722,242 @@ class LineageClient:
 
         # Apply the changes to the entity
         self._client.entities.update(patch_builder)
+
+    def get_lineage(
+        self,
+        *,
+        source_urn: Union[str, Urn],
+        source_column: Optional[str] = None,
+        direction: Literal["upstream", "downstream"] = "upstream",
+        max_hops: int = 1,
+        filter: Optional[Filter] = None,
+        count: int = 500,
+    ) -> List[LineageResult]:
+        """
+        Retrieve lineage entities connected to a source entity.
+        Args:
+            source_urn: Source URN for the lineage search
+            source_column: Source column for the lineage search
+            direction: Direction of lineage traversal
+            max_hops: Maximum number of hops to traverse
+            filter: Filters to apply to the lineage search
+            count: Maximum number of results to return
+
+        Returns:
+            List of lineage results
+
+        Raises:
+            SdkUsageError for invalid filter values
+        """
+        # Validate and convert input URN
+        source_urn = Urn.from_string(source_urn)
+        # Prepare GraphQL query variables with a separate method
+        variables = self._process_input_variables(
+            source_urn, source_column, filter, direction, max_hops, count
+        )
+
+        return self._execute_lineage_query(variables, direction)
+
+    def _process_input_variables(
+        self,
+        source_urn: Urn,
+        source_column: Optional[str] = None,
+        filters: Optional[Filter] = None,
+        direction: Literal["upstream", "downstream"] = "upstream",
+        max_hops: int = 1,
+        count: int = 500,
+    ) -> Dict[str, Any]:
+        """
+        Process filters and prepare GraphQL query variables for lineage search.
+
+        Args:
+            source_urn: Source URN for the lineage search
+            source_column: Source column for the lineage search
+            filters: Optional filters to apply
+            direction: Direction of lineage traversal
+            max_hops: Maximum number of hops to traverse
+            count: Maximum number of results to return
+
+        Returns:
+            Dictionary of GraphQL query variables
+
+        Raises:
+            SdkUsageError for invalid filter values
+        """
+
+        # print warning if max_hops is greater than 2
+        if max_hops > 2:
+            logger.warning(
+                """If `max_hops` is more than 2, the search will try to find the full lineage graph.
+                By default, only 500 results are shown.
+                You can change the `count` to get more or fewer results.
+                """
+            )
+
+        # Determine hop values
+        max_hop_values = (
+            [str(hop) for hop in range(1, max_hops + 1)]
+            if max_hops <= 2
+            else ["1", "2", "3+"]
+        )
+
+        degree_filter = FilterDsl.custom_filter(
+            field="degree",
+            condition="EQUAL",
+            values=max_hop_values,
+        )
+
+        filters_with_max_hops = (
+            FilterDsl.and_(degree_filter, filters)
+            if filters is not None
+            else degree_filter
+        )
+
+        types, compiled_filters = compile_filters(filters_with_max_hops)
+
+        # Prepare base variables
+        variables: Dict[str, Any] = {
+            "input": {
+                "urn": str(source_urn),
+                "direction": direction.upper(),
+                "count": count,
+                "types": types,
+                "orFilters": compiled_filters,
+            }
+        }
+
+        # if column is provided, update the variables to include the schema field urn
+        if isinstance(source_urn, SchemaFieldUrn) or source_column:
+            variables["input"]["searchFlags"] = {
+                "groupingSpec": {
+                    "groupingCriteria": {
+                        "baseEntityType": "SCHEMA_FIELD",
+                        "groupingEntityType": "SCHEMA_FIELD",
+                    }
+                }
+            }
+            if isinstance(source_urn, SchemaFieldUrn):
+                variables["input"]["urn"] = str(source_urn)
+            elif source_column:
+                variables["input"]["urn"] = str(SchemaFieldUrn(source_urn, source_column))
+
+        return variables
+
+    def _execute_lineage_query(
+        self,
+        variables: Dict[str, Any],
+        direction: Literal["upstream", "downstream"],
+    ) -> List[LineageResult]:
+        """Execute GraphQL query and process results."""
+        # Construct GraphQL query with dynamic path query
+        graphql_query = """
+            query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
+                scrollAcrossLineage(input: $input) {
+                    nextScrollId
+                    searchResults {
+                        degree
+                        entity {
+                            urn
+                            type
+                            ... on Dataset {
+                                name
+                                platform {
+                                    name
+                                }
+                                properties {
+                                    description
+                                }
+                            }
+                            ... on DataJob {
+                                jobId
+                                dataPlatformInstance {
+                                    platform {
+                                        name
+                                    }
+                                }
+                                properties {
+                                    name
+                                    description
+                                }
+                            }
+                        }
+                        paths {
+                            path {
+                                urn
+                                type
+                            }
+                        }
+                    }
+                }
+            }
+        """
+
+        results: List[LineageResult] = []
+
+        first_iter = True
+        scroll_id: Optional[str] = None
+
+        while first_iter or scroll_id:
+            first_iter = False
+
+            # Update scroll ID if applicable
+            if scroll_id:
+                variables["input"]["scrollId"] = scroll_id
+
+            # Execute GraphQL query
+            response = self._graph.execute_graphql(graphql_query, variables=variables)
+            data = response["scrollAcrossLineage"]
+            scroll_id = data.get("nextScrollId")
+
+            # Process search results
+            for entry in data["searchResults"]:
+                entity = entry["entity"]
+
+                result = self._create_lineage_result(entity, entry, direction)
+                results.append(result)
+
+        return results
+
+    def _create_lineage_result(
+        self,
+        entity: Dict[str, Any],
+        entry: Dict[str, Any],
+        direction: Literal["upstream", "downstream"],
+    ) -> LineageResult:
+        """Create a LineageResult from entity and entry data."""
+        platform = entity.get("platform", {}).get("name") or entity.get(
+            "dataPlatformInstance", {}
+        ).get("platform", {}).get("name")
+
+        result = LineageResult(
+            urn=entity["urn"],
+            type=entity["type"],
+            hops=entry["degree"],
+            direction=direction,
+            platform=platform,
+        )
+
+        properties = entity.get("properties", {})
+        if properties:
+            result.name = properties.get("name", "")
+            result.description = properties.get("description", "")
+
+        result.paths = []
+        if "paths" in entry:
+            # Process each path in the lineage graph
+            for path in entry["paths"]:
+                for path_entry in path["path"]:
+                    # Only include schema fields in the path (exclude other types like Query)
+                    if path_entry["type"] == "SCHEMA_FIELD":
+                        schema_field_urn = SchemaFieldUrn.from_string(path_entry["urn"])
+                        result.paths.append(
+                            LineagePath(
+                                urn=path_entry["urn"],
+                                entity_name=DatasetUrn.from_string(
+                                    schema_field_urn.parent
+                                ).name,
+                                column_name=schema_field_urn.field_path,
+                            )
+                        )
+
+        return result
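
Editorial note (not part of the diff): the last hunk adds a read path, get_lineage, which pages through the scrollAcrossLineage GraphQL endpoint and returns LineageResult records. A sketch follows, reusing `client` from the earlier note; the URN and filter values are illustrative, and it assumes FilterDsl.platform as the SDK's stock platform filter.

# Read-path sketch (editorial); URN and filter values are illustrative.
from datahub.sdk.search_filters import FilterDsl

results = client.lineage.get_lineage(
    source_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.orders,PROD)",
    direction="downstream",
    max_hops=2,  # values above 2 walk the full graph, capped by `count`
    filter=FilterDsl.platform("looker"),
    count=100,
)
for result in results:
    print(result.urn, result.type, result.hops, result.platform)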