acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,912 @@ datahub/ingestion/source/powerbi/m_query/pattern_handler.py (new file, entry 70 above)
+ import logging
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ from typing import Dict, List, Optional, Tuple, Type, cast
+
+ from lark import Tree
+
+ from datahub.emitter import mce_builder as builder
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.source.powerbi.config import (
+     Constant,
+     DataBricksPlatformDetail,
+     DataPlatformPair,
+     PlatformDetail,
+     PowerBiDashboardSourceConfig,
+     PowerBiDashboardSourceReport,
+     PowerBIPlatformDetail,
+     SupportedDataPlatform,
+ )
+ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+     AbstractDataPlatformInstanceResolver,
+ )
+ from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
+ from datahub.ingestion.source.powerbi.m_query.data_classes import (
+     DataAccessFunctionDetail,
+     DataPlatformTable,
+     FunctionName,
+     IdentifierAccessor,
+     Lineage,
+     ReferencedTable,
+ )
+ from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+ from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_next_item(items: List[str], item: str) -> Optional[str]:
+     if item in items:
+         try:
+             index = items.index(item)
+             return items[index + 1]
+         except IndexError:
+             logger.debug(f'item:"{item}" is the last element; it has no next item in {items}')
+     return None
+
+
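A quick usage sketch of get_next_item on a hypothetical token list (values invented for illustration; real token lists come from tree_function.token_values):

    tokens = ["Sql.Database", "localhost", "library", "Query", "SELECT 1"]
    get_next_item(tokens, "Query")     # -> "SELECT 1"
    get_next_item(tokens, "Missing")   # -> None (item not present)
    get_next_item(tokens, "SELECT 1")  # -> None (item is last; the IndexError is logged)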
+ def urn_to_lowercase(value: str, flag: bool) -> str:
+     if flag is True:
+         return value.lower()
+
+     return value
+
+
+ def make_urn(
+     config: PowerBiDashboardSourceConfig,
+     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+     data_platform_pair: DataPlatformPair,
+     server: str,
+     qualified_table_name: str,
+ ) -> str:
+     platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+         PowerBIPlatformDetail(
+             data_platform_pair=data_platform_pair,
+             data_platform_server=server,
+         )
+     )
+
+     return builder.make_dataset_urn_with_platform_instance(
+         platform=data_platform_pair.datahub_data_platform_name,
+         platform_instance=platform_detail.platform_instance,
+         env=platform_detail.env,
+         name=urn_to_lowercase(
+             qualified_table_name, config.convert_lineage_urns_to_lowercase
+         ),
+     )
+
+
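For orientation, a hedged sketch of what make_urn produces via make_dataset_urn_with_platform_instance; the config and resolver objects are assumed to be constructed elsewhere, and the exact URN depends on the configured platform_instance and env:

    urn = make_urn(
        config=config,                        # assumed PowerBiDashboardSourceConfig
        platform_instance_resolver=resolver,  # assumed resolver instance
        data_platform_pair=SupportedDataPlatform.MS_SQL.value,
        server="localhost",
        qualified_table_name="library.dbo.book_issue",
    )
    # With no platform_instance and env="PROD", this yields roughly:
    # urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)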
+ class AbstractLineage(ABC):
+     """
+     Base class that shares common functionality for M-Query parsing across the different data platforms.
+
+     To create a qualified table name we need to parse the M-Query data-access functions
+     (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions). Each data-access function follows a
+     defined pattern for accessing the database, schema and table names. For example, see the M-Query below:
+
+         let
+             Source = Sql.Database("localhost", "library"),
+             dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+         in
+             dbo_book_issue
+
+     This is MSSQL M-Query, and Sql.Database is the data-access function for MSSQL. If this function is present,
+     the database name is in the second argument of the first statement, while the schema name and table name are in
+     the second statement. The second statement can be repeated to access different tables from MSSQL.
+
+     TwoStepDataAccessPattern extends AbstractLineage and provides the common functionality for data platforms that
+     follow the above M-Query pattern.
+
+     The data-access function varies per data platform (for example MySQL.Database for MySQL, PostgreSQL.Database
+     for Postgres, and Oracle.Database for Oracle), and the number of statements needed to find the database, schema
+     and table names also varies per data platform.
+
+     Value.NativeQuery is a function used to execute a native query inside M-Query, for example:
+
+         let
+             Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+         in
+             Source
+
+     In this M-Query the database name is available in the first argument, and the remaining detail, i.e. the schema
+     and table, is available in the native query itself.
+
+     NativeQueryLineage extends AbstractLineage to support native-query parsing (see
+     SUPPORTED_NATIVE_QUERY_DATA_PLATFORM for the supported platforms).
+     """
+
+     ctx: PipelineContext
+     table: Table
+     config: PowerBiDashboardSourceConfig
+     reporter: PowerBiDashboardSourceReport
+     platform_instance_resolver: AbstractDataPlatformInstanceResolver
+
+     def __init__(
+         self,
+         ctx: PipelineContext,
+         table: Table,
+         config: PowerBiDashboardSourceConfig,
+         reporter: PowerBiDashboardSourceReport,
+         platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+     ) -> None:
+         super().__init__()
+         self.ctx = ctx
+         self.table = table
+         self.config = config
+         self.reporter = reporter
+         self.platform_instance_resolver = platform_instance_resolver
+
+     @abstractmethod
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         pass
+
+     @abstractmethod
+     def get_platform_pair(self) -> DataPlatformPair:
+         pass
+
+     @staticmethod
+     def get_db_detail_from_argument(
+         arg_list: Tree,
+     ) -> Tuple[Optional[str], Optional[str]]:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(arg_list)
+             ),
+         )
+
+         if len(arguments) < 2:
+             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
+             return None, None
+
+         return arguments[0], arguments[1]
+
+     @staticmethod
+     def create_reference_table(
+         arg_list: Tree,
+         table_detail: Dict[str, str],
+     ) -> Optional[ReferencedTable]:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(arg_list)
+             ),
+         )
+
+         logger.debug(f"Processing arguments {arguments}")
+
+         if (
+             len(arguments)
+             >= 4  # [0] is warehouse FQDN.
+             # [1] is endpoint, we are not using it.
+             # [2] is "Catalog" key
+             # [3] is catalog's value
+         ):
+             return ReferencedTable(
+                 warehouse=arguments[0],
+                 catalog=arguments[3],
+                 # In observed M-Queries the database and catalog names are the same
+                 database=table_detail["Database"]
+                 if table_detail.get("Database")
+                 else arguments[3],
+                 schema=table_detail["Schema"],
+                 table=table_detail.get("Table") or table_detail["View"],
+             )
+         elif len(arguments) == 2:
+             return ReferencedTable(
+                 warehouse=arguments[0],
+                 database=table_detail["Database"],
+                 schema=table_detail["Schema"],
+                 table=table_detail.get("Table") or table_detail["View"],
+                 catalog=None,
+             )
+
+         return None
+
+     def parse_custom_sql(
+         self, query: str, server: str, database: Optional[str], schema: Optional[str]
+     ) -> Lineage:
+         dataplatform_tables: List[DataPlatformTable] = []
+
+         platform_detail: PlatformDetail = (
+             self.platform_instance_resolver.get_platform_instance(
+                 PowerBIPlatformDetail(
+                     data_platform_pair=self.get_platform_pair(),
+                     data_platform_server=server,
+                 )
+             )
+         )
+
+         query = native_sql_parser.remove_drop_statement(
+             native_sql_parser.remove_special_characters(query)
+         )
+
+         parsed_result: Optional[
+             "SqlParsingResult"
+         ] = native_sql_parser.parse_custom_sql(
+             ctx=self.ctx,
+             query=query,
+             platform=self.get_platform_pair().datahub_data_platform_name,
+             platform_instance=platform_detail.platform_instance,
+             env=platform_detail.env,
+             database=database,
+             schema=schema,
+         )
+
+         if parsed_result is None:
+             self.reporter.info(
+                 title=Constant.SQL_PARSING_FAILURE,
+                 message="Failed to parse native SQL present in PowerBI M-Query",
+                 context=f"table-name={self.table.full_name}, sql={query}",
+             )
+             return Lineage.empty()
+
+         if parsed_result.debug_info and parsed_result.debug_info.table_error:
+             self.reporter.warning(
+                 title=Constant.SQL_PARSING_FAILURE,
+                 message="Failed to parse native SQL present in PowerBI M-Query",
+                 context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error}, sql={query}",
+             )
+             return Lineage.empty()
+
+         for urn in parsed_result.in_tables:
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+
+         logger.debug(f"Native Query parsed result={parsed_result}")
+         logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
+
+         return Lineage(
+             upstreams=dataplatform_tables,
+             column_lineage=(
+                 parsed_result.column_lineage
+                 if parsed_result.column_lineage is not None
+                 else []
+             ),
+         )
+
+
+ class AmazonRedshiftLineage(AbstractLineage):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.AMAZON_REDSHIFT.value
+
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
+         )
+
+         server, db_name = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if db_name is None or server is None:
+             return Lineage.empty()  # Return empty lineage
+
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Name"]
+
+         table_name: str = cast(
+             IdentifierAccessor,
+             cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+         ).items["Name"]
+
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+
+
+ class OracleLineage(AbstractLineage):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.ORACLE.value
+
+     @staticmethod
+     def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
+         error_message: str = (
+             f"The target argument ({value}) should be in the format of "
+             "<host-name>:<port>/<db-name>[.<domain>]"
+         )
+         splitter_result: List[str] = value.split("/")
+         if len(splitter_result) != 2:
+             logger.debug(error_message)
+             return None, None
+
+         db_name = splitter_result[1].split(".")[0]
+
+         return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
+
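A minimal illustration of _get_server_and_db_name with hypothetical inputs (the surrounding double quotes mimic what the tokenizer leaves in place; strip_char_from_list removes them):

    OracleLineage._get_server_and_db_name('"localhost:1521/salesdb.example.com"')
    # -> ("localhost:1521", "salesdb")
    OracleLineage._get_server_and_db_name("host-without-db-part")
    # -> (None, None), after logging the expected-format message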
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing Oracle data-access function detail {data_access_func_detail}"
+         )
+
+         arguments: List[str] = tree_function.remove_whitespaces_from_list(
+             tree_function.token_values(data_access_func_detail.arg_list)
+         )
+
+         server, db_name = self._get_server_and_db_name(arguments[0])
+
+         if db_name is None or server is None:
+             return Lineage.empty()
+
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Schema"]
+
+         table_name: str = cast(
+             IdentifierAccessor,
+             cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+         ).items["Name"]
+
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+
+
+ class DatabricksLineage(AbstractLineage):
+     def form_qualified_table_name(
+         self,
+         table_reference: ReferencedTable,
+         data_platform_pair: DataPlatformPair,
+     ) -> str:
+         platform_detail: PlatformDetail = (
+             self.platform_instance_resolver.get_platform_instance(
+                 PowerBIPlatformDetail(
+                     data_platform_pair=data_platform_pair,
+                     data_platform_server=table_reference.warehouse,
+                 )
+             )
+         )
+
+         metastore: Optional[str] = None
+
+         qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
+
+         if isinstance(platform_detail, DataBricksPlatformDetail):
+             metastore = platform_detail.metastore
+
+         if metastore is not None:
+             return f"{metastore}.{qualified_table_name}"
+
+         return qualified_table_name
+
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing Databricks data-access function detail {data_access_func_detail}"
+         )
+         table_detail: Dict[str, str] = {}
+         temp_accessor: Optional[
+             IdentifierAccessor
+         ] = data_access_func_detail.identifier_accessor
+
+         while temp_accessor:
+             # Handle the Databricks M-Query pattern where table, schema and database are all present in
+             # the same invoke statement
+             if all(
+                 element in temp_accessor.items
+                 for element in ["Item", "Schema", "Catalog"]
+             ):
+                 table_detail["Schema"] = temp_accessor.items["Schema"]
+                 table_detail["Table"] = temp_accessor.items["Item"]
+             else:
+                 table_detail[temp_accessor.items["Kind"]] = temp_accessor.items["Name"]
+
+             if temp_accessor.next is not None:
+                 temp_accessor = temp_accessor.next
+             else:
+                 break
+
+         table_reference = self.create_reference_table(
+             arg_list=data_access_func_detail.arg_list,
+             table_detail=table_detail,
+         )
+
+         if table_reference:
+             qualified_table_name: str = self.form_qualified_table_name(
+                 table_reference=table_reference,
+                 data_platform_pair=self.get_platform_pair(),
+             )
+
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=table_reference.warehouse,
+                 qualified_table_name=qualified_table_name,
+             )
+
+             return Lineage(
+                 upstreams=[
+                     DataPlatformTable(
+                         data_platform_pair=self.get_platform_pair(),
+                         urn=urn,
+                     )
+                 ],
+                 column_lineage=[],
+             )
+
+         return Lineage.empty()
+
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.DATABRICKS_SQL.value
+
+
+ class TwoStepDataAccessPattern(AbstractLineage, ABC):
+     """
+     Covers data sources for which PowerBI Desktop generates a default M-Query of the following pattern:
+
+         let
+             Source = Sql.Database("localhost", "library"),
+             dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+         in
+             dbo_book_issue
+     """
+
+     def two_level_access_pattern(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+         )
+
+         server, db_name = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if server is None or db_name is None:
+             return Lineage.empty()  # Return empty lineage
+
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Schema"]
+
+         table_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Item"]
+
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+         logger.debug(
+             f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name={qualified_table_name}"
+         )
+
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+
+
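Tracing the docstring's example through two_level_access_pattern, as a hedged walk-through (the IdentifierAccessor shape is inferred from how .items is read above):

    # Sql.Database("localhost", "library")
    #   -> get_db_detail_from_argument: server="localhost", db_name="library"
    # Source{[Schema="dbo",Item="book_issue"]}[Data]
    #   -> identifier_accessor.items == {"Schema": "dbo", "Item": "book_issue"}
    # qualified_table_name == "library.dbo.book_issue"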
+ class PostgresLineage(TwoStepDataAccessPattern):
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         return self.two_level_access_pattern(data_access_func_detail)
+
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.POSTGRES_SQL.value
+
+
+ class MSSqlLineage(TwoStepDataAccessPattern):
+     # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
+     DEFAULT_SCHEMA = "dbo"  # Default schema name in MS-SQL is dbo
+
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.MS_SQL.value
+
+     def create_urn_using_old_parser(
+         self, query: str, db_name: str, server: str
+     ) -> List[DataPlatformTable]:
+         dataplatform_tables: List[DataPlatformTable] = []
+
+         tables: List[str] = native_sql_parser.get_tables(query)
+
+         for parsed_table in tables:
+             components = [v.strip("[]") for v in parsed_table.split(".")]
+             if len(components) == 3:
+                 database, schema, table = components
+             elif len(components) == 2:
+                 schema, table = components
+                 database = db_name
+             elif len(components) == 1:
+                 (table,) = components
+                 database = db_name
+                 schema = MSSqlLineage.DEFAULT_SCHEMA
+             else:
+                 self.reporter.warning(
+                     title="Invalid table format",
+                     message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as <db-name>.<schema-name>.<table-name> in the SQL.",
+                     context=f"table-name={self.table.full_name}",
+                 )
+                 continue
+
+             qualified_table_name = f"{database}.{schema}.{table}"
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=server,
+                 qualified_table_name=qualified_table_name,
+             )
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+
+         logger.debug(f"Generated upstream tables = {dataplatform_tables}")
+
+         return dataplatform_tables
+
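The branching above normalizes one-, two- and three-part table references; a standalone sketch of the same logic with hypothetical inputs:

    db_name, default_schema = "library", "dbo"  # hypothetical values
    for parsed_table in ["sales.dbo.orders", "[dbo].[orders]", "orders"]:
        components = [v.strip("[]") for v in parsed_table.split(".")]
        if len(components) == 3:
            database, schema, table = components
        elif len(components) == 2:
            schema, table = components
            database = db_name
        else:
            (table,) = components
            database, schema = db_name, default_schema
        print(f"{database}.{schema}.{table}")
    # -> sales.dbo.orders, library.dbo.orders, library.dbo.orders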
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(data_access_func_detail.arg_list)
+             ),
+         )
+
+         server, database = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if server is None or database is None:
+             return Lineage.empty()  # Return empty lineage
+
+         assert server
+         assert database  # to silence the linter
+
+         query: Optional[str] = get_next_item(arguments, "Query")
+         if query:
+             if self.config.enable_advance_lineage_sql_construct is False:
+                 # Use the previous parser to generate the URN to keep backward compatibility
+                 return Lineage(
+                     upstreams=self.create_urn_using_old_parser(
+                         query=query,
+                         db_name=database,
+                         server=server,
+                     ),
+                     column_lineage=[],
+                 )
+
+             return self.parse_custom_sql(
+                 query=query,
+                 database=database,
+                 server=server,
+                 schema=MSSqlLineage.DEFAULT_SCHEMA,
+             )
+
+         # It is a regular case of MS-SQL
+         logger.debug("Handling regular MS-SQL case")
+         return self.two_level_access_pattern(data_access_func_detail)
+
+
+ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
+     def get_datasource_server(
+         self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+     ) -> str:
+         return tree_function.strip_char_from_list([arguments[0]])[0]
+
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
+         )
+
+         arguments: List[str] = tree_function.remove_whitespaces_from_list(
+             tree_function.token_values(data_access_func_detail.arg_list)
+         )
+         # First is database name
+         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
+         # Second is schema name
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+         ).items["Name"]
+         # Third is table name
+         table_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+         ).items["Name"]
+
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+         logger.debug(
+             f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
+         )
+
+         server: str = self.get_datasource_server(arguments, data_access_func_detail)
+
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+
+
+ class SnowflakeLineage(ThreeStepDataAccessPattern):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.SNOWFLAKE.value
+
+
+ class GoogleBigQueryLineage(ThreeStepDataAccessPattern):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.GOOGLE_BIGQUERY.value
+
+     def get_datasource_server(
+         self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+     ) -> str:
+         # In Google BigQuery the server is the project name.
+         # The condition silences the linter; identifier_accessor is not going to be None.
+         return (
+             data_access_func_detail.identifier_accessor.items["Name"]
+             if data_access_func_detail.identifier_accessor is not None
+             else ""
+         )
+
+
+ class NativeQueryLineage(AbstractLineage):
+     SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
+         SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
+         SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
+         SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
+     }
+     current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
+
+     def get_platform_pair(self) -> DataPlatformPair:
+         return self.current_data_platform.value
+
+     @staticmethod
+     def is_native_parsing_supported(data_access_function_name: str) -> bool:
+         return (
+             data_access_function_name
+             in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
+         )
+
+     def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
+         dataplatform_tables: List[DataPlatformTable] = []
+
+         tables: List[str] = native_sql_parser.get_tables(query)
+
+         for qualified_table_name in tables:
+             if len(qualified_table_name.split(".")) != 3:
+                 logger.debug(
+                     f"Skipping table {qualified_table_name} as it is not in the qualified <db>.<schema>.<table> format"
+                 )
+                 continue
+
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=server,
+                 qualified_table_name=qualified_table_name,
+             )
+
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+
+         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
+
+         return Lineage(
+             upstreams=dataplatform_tables,
+             column_lineage=[],
+         )
+
+     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
+         if (
+             data_access_tokens[0]
+             != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
+         ):
+             return None
+
+         database: Optional[str] = get_next_item(data_access_tokens, "Database")
+
+         if (
+             database and database != Constant.M_QUERY_NULL
+         ):  # database name is explicitly set
+             return database
+
+         return get_next_item(  # database name is set in the Name argument
+             data_access_tokens, "Name"
+         ) or get_next_item(  # if neither of the above is available, try Catalog
+             data_access_tokens, "Catalog"
+         )
+
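The Database → Name → Catalog fallback in get_db_name, sketched with a hypothetical Databricks token list (Constant.M_QUERY_NULL is assumed here to be the literal "null"; the first token must match the DatabricksMultiCloud_SQL platform name):

    tokens = [
        SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name,
        "Database", "null",         # explicit null -> treated as unset
        "Name", "sales_db",         # first fallback wins
        "Catalog", "sales_catalog",
    ]
    # get_next_item(tokens, "Database") -> "null" (== M_QUERY_NULL, skipped)
    # get_next_item(tokens, "Name")     -> "sales_db", so get_db_name returns "sales_db"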
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         t1: Optional[Tree] = tree_function.first_arg_list_func(
+             data_access_func_detail.arg_list
+         )
+         assert t1 is not None
+         flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
+
+         if len(flat_argument_list) != 2:
+             logger.debug(
+                 f"Expecting 2 arguments, actual argument count is {len(flat_argument_list)}"
+             )
+             logger.debug(f"Flat argument list = {flat_argument_list}")
+             return Lineage.empty()
+
+         data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
+             tree_function.token_values(flat_argument_list[0])
+         )
+
+         if not self.is_native_parsing_supported(data_access_tokens[0]):
+             logger.debug(
+                 f"Unsupported native-query data-platform = {data_access_tokens[0]}"
+             )
+             logger.debug(
+                 f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
+             )
+
+             return Lineage.empty()
+
+         if len(data_access_tokens) < 3:  # the server is read from index 2 below
+             logger.debug(
+                 f"Server is not available in the argument list for data-platform {data_access_tokens[0]}. Returning "
+                 "empty lineage"
+             )
+             return Lineage.empty()
+
+         self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
+             data_access_tokens[0]
+         ]
+         # The second flat argument is the query
+         sql_query: str = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(flat_argument_list[1])
+             ),
+         )[0]  # Remove any whitespace and double-quote characters
+
+         server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
+
+         if self.config.enable_advance_lineage_sql_construct is False:
+             # Use the previous parser to generate the URN to keep backward compatibility
+             return self.create_urn_using_old_parser(
+                 query=sql_query,
+                 server=server,
+             )
+
+         database_name: Optional[str] = self.get_db_name(data_access_tokens)
+
+         return self.parse_custom_sql(
+             query=sql_query,
+             server=server,
+             database=database_name,
+             schema=None,
+         )
+
+
+ class SupportedPattern(Enum):
+     DATABRICKS_QUERY = (
+         DatabricksLineage,
+         FunctionName.DATABRICK_DATA_ACCESS,
+     )
+
+     DATABRICKS_MULTI_CLOUD = (
+         DatabricksLineage,
+         FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
+     )
+
+     POSTGRES_SQL = (
+         PostgresLineage,
+         FunctionName.POSTGRESQL_DATA_ACCESS,
+     )
+
+     ORACLE = (
+         OracleLineage,
+         FunctionName.ORACLE_DATA_ACCESS,
+     )
+
+     SNOWFLAKE = (
+         SnowflakeLineage,
+         FunctionName.SNOWFLAKE_DATA_ACCESS,
+     )
+
+     MS_SQL = (
+         MSSqlLineage,
+         FunctionName.MSSQL_DATA_ACCESS,
+     )
+
+     GOOGLE_BIG_QUERY = (
+         GoogleBigQueryLineage,
+         FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
+     )
+
+     AMAZON_REDSHIFT = (
+         AmazonRedshiftLineage,
+         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
+     )
+
+     NATIVE_QUERY = (
+         NativeQueryLineage,
+         FunctionName.NATIVE_QUERY,
+     )
+
+     def handler(self) -> Type[AbstractLineage]:
+         return self.value[0]
+
+     def function_name(self) -> str:
+         return self.value[1].value
+
+     @staticmethod
+     def get_function_names() -> List[str]:
+         functions: List[str] = []
+         for supported_resolver in SupportedPattern:
+             functions.append(supported_resolver.function_name())
+
+         return functions
+
+     @staticmethod
+     def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]:
+         logger.debug(f"Looking for pattern-handler for {function_name}")
+         for supported_resolver in SupportedPattern:
+             if function_name == supported_resolver.function_name():
+                 return supported_resolver
+         logger.debug(f"pattern-handler not found for function_name {function_name}")
+         return None
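Taken together, a hedged sketch of how a caller is expected to dispatch through SupportedPattern (the ctx/table/config/reporter/resolver objects and data_access_func_detail are assumed to come from the surrounding parser; "Sql.Database" is the MSSQL data-access function named in the AbstractLineage docstring):

    pattern = SupportedPattern.get_pattern_handler("Sql.Database")
    if pattern is not None:
        handler = pattern.handler()(  # -> MSSqlLineage
            ctx=ctx,
            table=table,
            config=config,
            reporter=reporter,
            platform_instance_resolver=resolver,
        )
        lineage = handler.create_lineage(data_access_func_detail)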