acryl-datahub 0.15.0rc3__py3-none-any.whl → 0.15.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
@@ -0,0 +1,920 @@
+ import logging
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ from typing import Dict, List, Optional, Tuple, Type, Union, cast
+ 
+ from lark import Tree
+ 
+ from datahub.emitter import mce_builder as builder
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.source.powerbi.config import (
+     Constant,
+     DataBricksPlatformDetail,
+     DataPlatformPair,
+     PlatformDetail,
+     PowerBiDashboardSourceConfig,
+     PowerBiDashboardSourceReport,
+     PowerBIPlatformDetail,
+     SupportedDataPlatform,
+ )
+ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+     AbstractDataPlatformInstanceResolver,
+ )
+ from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
+ from datahub.ingestion.source.powerbi.m_query.data_classes import (
+     AbstractIdentifierAccessor,
+     DataAccessFunctionDetail,
+     DataPlatformTable,
+     FunctionName,
+     IdentifierAccessor,
+     Lineage,
+     ReferencedTable,
+ )
+ from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+ from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ def get_next_item(items: List[str], item: str) -> Optional[str]:
+     if item in items:
+         try:
+             index = items.index(item)
+             return items[index + 1]
+         except IndexError:
+             logger.debug(f'item "{item}" is the last element; no next item in {items}')
+     return None
+ 
+ 
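get_next_item is used throughout this module to read the value that follows a key in a flattened M-Query argument list. A quick illustration with made-up tokens:

    value = get_next_item(["Query", "select * from book_issue"], "Query")
    # value == "select * from book_issue"; if the key is absent, or is the
    # last element so nothing follows it, the helper returns None.
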
+ def urn_to_lowercase(value: str, flag: bool) -> str:
+     if flag is True:
+         return value.lower()
+ 
+     return value
+ 
+ 
+ def make_urn(
+     config: PowerBiDashboardSourceConfig,
+     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+     data_platform_pair: DataPlatformPair,
+     server: str,
+     qualified_table_name: str,
+ ) -> str:
+     platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+         PowerBIPlatformDetail(
+             data_platform_pair=data_platform_pair,
+             data_platform_server=server,
+         )
+     )
+ 
+     return builder.make_dataset_urn_with_platform_instance(
+         platform=data_platform_pair.datahub_data_platform_name,
+         platform_instance=platform_detail.platform_instance,
+         env=platform_detail.env,
+         name=urn_to_lowercase(
+             qualified_table_name, config.convert_lineage_urns_to_lowercase
+         ),
+     )
+ 
+ 
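For reference, builder.make_dataset_urn_with_platform_instance renders a DataHub dataset URN of the shape sketched below; the platform, instance, and table values here are hypothetical:

    from datahub.emitter import mce_builder as builder

    urn = builder.make_dataset_urn_with_platform_instance(
        platform="mssql",
        platform_instance="prod_instance",
        env="PROD",
        name="library.dbo.book_issue",
    )
    # urn == "urn:li:dataset:(urn:li:dataPlatform:mssql,prod_instance.library.dbo.book_issue,PROD)"

When platform_instance is None, the instance prefix is simply omitted from the dataset name.
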
+ class AbstractLineage(ABC):
+     """
+     Base class sharing common functionality for parsing M-Query lineage across data platforms.
+ 
+     To build a qualified table name we need to parse the M-Query data-access functions
+     (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions). Each data-access function follows a
+     defined pattern for reaching the database, schema, and table names. For example, consider the M-Query below:
+ 
+         let
+             Source = Sql.Database("localhost", "library"),
+             dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+         in
+             dbo_book_issue
+ 
+     This is MS-SQL M-Query, and Sql.Database is the data-access function for MS-SQL. When this function is present,
+     the database name is the second argument of the first statement, while the schema and table names appear in the
+     second statement. The second statement can be repeated to access further tables from MS-SQL.
+ 
+     TwoStepDataAccessPattern extends AbstractLineage and provides the common functionality for data platforms
+     whose M-Query follows the pattern above.
+ 
+     The data-access function varies per data platform, for example MySQL.Database for MySQL, PostgreSQL.Database
+     for Postgres, and Oracle.Database for Oracle, and the number of statements needed to find the database, schema,
+     and table names also varies per platform.
+ 
+     Value.NativeQuery is one of the functions used to execute a native query inside M-Query, for example:
+ 
+         let
+             Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+         in
+             Source
+ 
+     In this M-Query the database name is available in the first argument, and the rest of the detail, i.e. the
+     schema and table names, is available in the native query itself.
+ 
+     NativeQueryLineage extends AbstractLineage to support native-query parsing for the platforms listed in its
+     SUPPORTED_NATIVE_QUERY_DATA_PLATFORM mapping (Snowflake, Amazon Redshift, and Databricks multi-cloud).
+     """
+ 
+     ctx: PipelineContext
+     table: Table
+     config: PowerBiDashboardSourceConfig
+     reporter: PowerBiDashboardSourceReport
+     platform_instance_resolver: AbstractDataPlatformInstanceResolver
+ 
+     def __init__(
+         self,
+         ctx: PipelineContext,
+         table: Table,
+         config: PowerBiDashboardSourceConfig,
+         reporter: PowerBiDashboardSourceReport,
+         platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+     ) -> None:
+         super().__init__()
+         self.ctx = ctx
+         self.table = table
+         self.config = config
+         self.reporter = reporter
+         self.platform_instance_resolver = platform_instance_resolver
+ 
+     @abstractmethod
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         pass
+ 
+     @abstractmethod
+     def get_platform_pair(self) -> DataPlatformPair:
+         pass
+ 
+     @staticmethod
+     def get_db_detail_from_argument(
+         arg_list: Tree,
+     ) -> Tuple[Optional[str], Optional[str]]:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(arg_list)
+             ),
+         )
+ 
+         if len(arguments) < 2:
+             logger.debug(f"Expected at least 2 arguments, but got {len(arguments)}")
+             return None, None
+ 
+         return arguments[0], arguments[1]
+ 
+     @staticmethod
+     def create_reference_table(
+         arg_list: Tree,
+         table_detail: Dict[str, str],
+     ) -> Optional[ReferencedTable]:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(arg_list)
+             ),
+         )
+ 
+         logger.debug(f"Processing arguments {arguments}")
+ 
+         # arguments[0] is the warehouse FQDN; [1] is the endpoint (unused);
+         # [2] is the "Catalog" key; [3] is the catalog's value.
+         if len(arguments) >= 4:
+             return ReferencedTable(
+                 warehouse=arguments[0],
+                 catalog=arguments[3],
+                 # In observed M-Query expressions, the database and catalog names are the same.
+                 database=table_detail["Database"]
+                 if table_detail.get("Database")
+                 else arguments[3],
+                 schema=table_detail["Schema"],
+                 table=table_detail.get("Table") or table_detail["View"],
+             )
+         elif len(arguments) == 2:
+             return ReferencedTable(
+                 warehouse=arguments[0],
+                 database=table_detail["Database"],
+                 schema=table_detail["Schema"],
+                 table=table_detail.get("Table") or table_detail["View"],
+                 catalog=None,
+             )
+ 
+         return None
+ 
+     def parse_custom_sql(
+         self, query: str, server: str, database: Optional[str], schema: Optional[str]
+     ) -> Lineage:
+         dataplatform_tables: List[DataPlatformTable] = []
+ 
+         platform_detail: PlatformDetail = (
+             self.platform_instance_resolver.get_platform_instance(
+                 PowerBIPlatformDetail(
+                     data_platform_pair=self.get_platform_pair(),
+                     data_platform_server=server,
+                 )
+             )
+         )
+ 
+         query = native_sql_parser.remove_drop_statement(
+             native_sql_parser.remove_special_characters(query)
+         )
+ 
+         parsed_result: Optional[
+             "SqlParsingResult"
+         ] = native_sql_parser.parse_custom_sql(
+             ctx=self.ctx,
+             query=query,
+             platform=self.get_platform_pair().datahub_data_platform_name,
+             platform_instance=platform_detail.platform_instance,
+             env=platform_detail.env,
+             database=database,
+             schema=schema,
+         )
+ 
+         if parsed_result is None:
+             self.reporter.info(
+                 title=Constant.SQL_PARSING_FAILURE,
+                 message="Failed to parse the native SQL present in the PowerBI M-Query",
+                 context=f"table-name={self.table.full_name}, sql={query}",
+             )
+             return Lineage.empty()
+ 
+         if parsed_result.debug_info and parsed_result.debug_info.table_error:
+             self.reporter.warning(
+                 title=Constant.SQL_PARSING_FAILURE,
+                 message="Failed to parse the native SQL present in the PowerBI M-Query",
+                 context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error}, sql={query}",
+             )
+             return Lineage.empty()
+ 
+         for urn in parsed_result.in_tables:
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+ 
+         logger.debug(f"Native query parsed result={parsed_result}")
+         logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
+ 
+         return Lineage(
+             upstreams=dataplatform_tables,
+             column_lineage=(
+                 parsed_result.column_lineage
+                 if parsed_result.column_lineage is not None
+                 else []
+             ),
+         )
+ 
+ 
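parse_custom_sql above delegates to native_sql_parser.parse_custom_sql, which wraps DataHub's sqlglot-based lineage parser. A minimal standalone sketch of that call, mirroring the keyword arguments used above; the query and identifiers are hypothetical:

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.powerbi.m_query import native_sql_parser

    result = native_sql_parser.parse_custom_sql(
        ctx=PipelineContext(run_id="powerbi-lineage-sketch"),
        query="select * from dev.public.category",
        platform="redshift",
        platform_instance=None,
        env="PROD",
        database="dev",
        schema="public",
    )
    if result is not None:
        # e.g. ["urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)"]
        print(result.in_tables)
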
+ class AmazonRedshiftLineage(AbstractLineage):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.AMAZON_REDSHIFT.value
+ 
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
+         )
+ 
+         server, db_name = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if db_name is None or server is None:
+             return Lineage.empty()  # Return empty lineage
+ 
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Name"]
+ 
+         table_name: str = cast(
+             IdentifierAccessor,
+             cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+         ).items["Name"]
+ 
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ 
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+ 
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+ 
+ 
+ class OracleLineage(AbstractLineage):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.ORACLE.value
+ 
+     @staticmethod
+     def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
+         error_message: str = (
+             f"The target argument ({value}) should be in the format of <host-name>:<port>/<db-name>["
+             ".<domain>]"
+         )
+         splitter_result: List[str] = value.split("/")
+         if len(splitter_result) != 2:
+             logger.debug(error_message)
+             return None, None
+ 
+         db_name = splitter_result[1].split(".")[0]
+ 
+         return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
+ 
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing Oracle data-access function detail {data_access_func_detail}"
+         )
+ 
+         arguments: List[str] = tree_function.remove_whitespaces_from_list(
+             tree_function.token_values(data_access_func_detail.arg_list)
+         )
+ 
+         server, db_name = self._get_server_and_db_name(arguments[0])
+ 
+         if db_name is None or server is None:
+             return Lineage.empty()
+ 
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Schema"]
+ 
+         table_name: str = cast(
+             IdentifierAccessor,
+             cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+         ).items["Name"]
+ 
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ 
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+ 
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+ 
+ 
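_get_server_and_db_name expects the Oracle target in the <host-name>:<port>/<db-name>[.<domain>] form named in its error message; a hypothetical connection string shows the split:

    server, db_name = OracleLineage._get_server_and_db_name(
        "localhost:1521/salesdb.example.com"
    )
    # server == "localhost:1521", db_name == "salesdb"; a value without
    # exactly one "/" fails the check and yields (None, None).
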
+ class DatabricksLineage(AbstractLineage):
+     def form_qualified_table_name(
+         self,
+         table_reference: ReferencedTable,
+         data_platform_pair: DataPlatformPair,
+     ) -> str:
+         platform_detail: PlatformDetail = (
+             self.platform_instance_resolver.get_platform_instance(
+                 PowerBIPlatformDetail(
+                     data_platform_pair=data_platform_pair,
+                     data_platform_server=table_reference.warehouse,
+                 )
+             )
+         )
+ 
+         metastore: Optional[str] = None
+ 
+         qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
+ 
+         if isinstance(platform_detail, DataBricksPlatformDetail):
+             metastore = platform_detail.metastore
+ 
+         if metastore is not None:
+             return f"{metastore}.{qualified_table_name}"
+ 
+         return qualified_table_name
+ 
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing Databricks data-access function detail {data_access_func_detail}"
+         )
+         table_detail: Dict[str, str] = {}
+         temp_accessor: Optional[
+             Union[IdentifierAccessor, AbstractIdentifierAccessor]
+         ] = data_access_func_detail.identifier_accessor
+ 
+         while temp_accessor:
+             if isinstance(temp_accessor, IdentifierAccessor):
+                 # Handle the Databricks M-Query pattern where the table, schema, and database
+                 # are all present in the same invoke statement
+                 if all(
+                     element in temp_accessor.items
+                     for element in ["Item", "Schema", "Catalog"]
+                 ):
+                     table_detail["Schema"] = temp_accessor.items["Schema"]
+                     table_detail["Table"] = temp_accessor.items["Item"]
+                 else:
+                     table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
+                         "Name"
+                     ]
+ 
+                 if temp_accessor.next is not None:
+                     temp_accessor = temp_accessor.next
+                 else:
+                     break
+             else:
+                 logger.debug(
+                     "Expecting the instance to be IdentifierAccessor; please check whether parsing was done properly"
+                 )
+                 return Lineage.empty()
+ 
+         table_reference = self.create_reference_table(
+             arg_list=data_access_func_detail.arg_list,
+             table_detail=table_detail,
+         )
+ 
+         if table_reference:
+             qualified_table_name: str = self.form_qualified_table_name(
+                 table_reference=table_reference,
+                 data_platform_pair=self.get_platform_pair(),
+             )
+ 
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=table_reference.warehouse,
+                 qualified_table_name=qualified_table_name,
+             )
+ 
+             return Lineage(
+                 upstreams=[
+                     DataPlatformTable(
+                         data_platform_pair=self.get_platform_pair(),
+                         urn=urn,
+                     )
+                 ],
+                 column_lineage=[],
+             )
+ 
+         return Lineage.empty()
+ 
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.DATABRICKS_SQL.value
+ 
+ 
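The accessor walk in DatabricksLineage.create_lineage folds a linked chain of {Kind, Name} items into the flat table_detail dict that create_reference_table consumes. A minimal sketch with plain dicts standing in for IdentifierAccessor.items (values are hypothetical):

    chain = [
        {"Kind": "Database", "Name": "dev"},
        {"Kind": "Schema", "Name": "public"},
        {"Kind": "Table", "Name": "category"},
    ]
    table_detail: Dict[str, str] = {}
    for items in chain:
        table_detail[items["Kind"]] = items["Name"]
    # table_detail == {"Database": "dev", "Schema": "public", "Table": "category"}
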
+ class TwoStepDataAccessPattern(AbstractLineage, ABC):
+     """
+     Data sources for which PowerBI Desktop generates a default M-Query of the following pattern:
+         let
+             Source = Sql.Database("localhost", "library"),
+             dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+         in
+             dbo_book_issue
+     """
+ 
+     def two_level_access_pattern(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+         )
+ 
+         server, db_name = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if server is None or db_name is None:
+             return Lineage.empty()  # Return empty lineage
+ 
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Schema"]
+ 
+         table_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor
+         ).items["Item"]
+ 
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ 
+         logger.debug(
+             f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+         )
+ 
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+ 
+ 
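Applied to the M-Query in the docstring above, two_level_access_pattern reads "library" from Sql.Database("localhost", "library") and the Schema/Item values from the accessor, so:

    db_name, schema_name, table_name = "library", "dbo", "book_issue"
    qualified_table_name = f"{db_name}.{schema_name}.{table_name}"
    # "library.dbo.book_issue", which is then passed to make_urn together
    # with server="localhost".
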
+ class PostgresLineage(TwoStepDataAccessPattern):
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         return self.two_level_access_pattern(data_access_func_detail)
+ 
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.POSTGRES_SQL.value
+ 
+ 
+ class MSSqlLineage(TwoStepDataAccessPattern):
+     # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
+     DEFAULT_SCHEMA = "dbo"  # Default schema name in MS-SQL is dbo
+ 
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.MS_SQL.value
+ 
+     def create_urn_using_old_parser(
+         self, query: str, db_name: str, server: str
+     ) -> List[DataPlatformTable]:
+         dataplatform_tables: List[DataPlatformTable] = []
+ 
+         tables: List[str] = native_sql_parser.get_tables(query)
+ 
+         for parsed_table in tables:
+             components = [v.strip("[]") for v in parsed_table.split(".")]
+             if len(components) == 3:
+                 database, schema, table = components
+             elif len(components) == 2:
+                 schema, table = components
+                 database = db_name
+             elif len(components) == 1:
+                 (table,) = components
+                 database = db_name
+                 schema = MSSqlLineage.DEFAULT_SCHEMA
+             else:
+                 self.reporter.warning(
+                     title="Invalid table format",
+                     message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as <db-name>.<schema-name>.<table-name> in the SQL.",
+                     context=f"table-name={self.table.full_name}",
+                 )
+                 continue
+ 
+             qualified_table_name = f"{database}.{schema}.{table}"
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=server,
+                 qualified_table_name=qualified_table_name,
+             )
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+ 
+         logger.debug(f"Generated upstream tables = {dataplatform_tables}")
+ 
+         return dataplatform_tables
+ 
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         arguments: List[str] = tree_function.strip_char_from_list(
+             values=tree_function.remove_whitespaces_from_list(
+                 tree_function.token_values(data_access_func_detail.arg_list)
+             ),
+         )
+ 
+         server, database = self.get_db_detail_from_argument(
+             data_access_func_detail.arg_list
+         )
+         if server is None or database is None:
+             return Lineage.empty()  # Return empty lineage
+ 
+         assert server
+         assert database  # to silence the linter
+ 
+         query: Optional[str] = get_next_item(arguments, "Query")
+         if query:
+             if self.config.enable_advance_lineage_sql_construct is False:
+                 # Use the previous parser to generate the URN to keep backward compatibility
+                 return Lineage(
+                     upstreams=self.create_urn_using_old_parser(
+                         query=query,
+                         db_name=database,
+                         server=server,
+                     ),
+                     column_lineage=[],
+                 )
+ 
+             return self.parse_custom_sql(
+                 query=query,
+                 database=database,
+                 server=server,
+                 schema=MSSqlLineage.DEFAULT_SCHEMA,
+             )
+ 
+         # It is a regular case of MS-SQL
+         logger.debug("Handling the regular MS-SQL case")
+         return self.two_level_access_pattern(data_access_func_detail)
+ 
+ 
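The bracket-stripping in create_urn_using_old_parser normalizes T-SQL style identifiers before unpacking them; for instance, with a hypothetical table reference:

    parsed_table = "[library].[dbo].[book_issue]"
    components = [v.strip("[]") for v in parsed_table.split(".")]
    # components == ["library", "dbo", "book_issue"], unpacked as database,
    # schema, table; two- and one-part names fall back to the M-Query's
    # database and the "dbo" default schema.
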
+ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
+     def get_datasource_server(
+         self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+     ) -> str:
+         return tree_function.strip_char_from_list([arguments[0]])[0]
+ 
+     def create_lineage(
+         self, data_access_func_detail: DataAccessFunctionDetail
+     ) -> Lineage:
+         logger.debug(
+             f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
+         )
+ 
+         arguments: List[str] = tree_function.remove_whitespaces_from_list(
+             tree_function.token_values(data_access_func_detail.arg_list)
+         )
+         # The first identifier is the database name
+         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
+         # The second identifier is the schema name
+         schema_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+         ).items["Name"]
+         # The third identifier is the table name
+         table_name: str = cast(
+             IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+         ).items["Name"]
+ 
+         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ 
+         logger.debug(
+             f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
+         )
+ 
+         server: str = self.get_datasource_server(arguments, data_access_func_detail)
+ 
+         urn = make_urn(
+             config=self.config,
+             platform_instance_resolver=self.platform_instance_resolver,
+             data_platform_pair=self.get_platform_pair(),
+             server=server,
+             qualified_table_name=qualified_table_name,
+         )
+ 
+         return Lineage(
+             upstreams=[
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             ],
+             column_lineage=[],
+         )
+ 
+ 
+ class SnowflakeLineage(ThreeStepDataAccessPattern):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.SNOWFLAKE.value
+ 
+ 
+ class GoogleBigQueryLineage(ThreeStepDataAccessPattern):
+     def get_platform_pair(self) -> DataPlatformPair:
+         return SupportedDataPlatform.GOOGLE_BIGQUERY.value
+ 
+     def get_datasource_server(
+         self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+     ) -> str:
+         # In Google BigQuery, the server is the project name.
+         # The conditional silences the linter; the accessor is not going to be None.
+         return (
+             data_access_func_detail.identifier_accessor.items["Name"]
+             if data_access_func_detail.identifier_accessor is not None
+             else ""
+         )
+ 
+ 
+ class NativeQueryLineage(AbstractLineage):
+     SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
+         SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
+         SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
+         SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
+     }
+     current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
+ 
+     def get_platform_pair(self) -> DataPlatformPair:
+         return self.current_data_platform.value
+ 
+     @staticmethod
+     def is_native_parsing_supported(data_access_function_name: str) -> bool:
+         return (
+             data_access_function_name
+             in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
+         )
+ 
+     def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
+         dataplatform_tables: List[DataPlatformTable] = []
+ 
+         tables: List[str] = native_sql_parser.get_tables(query)
+ 
+         for qualified_table_name in tables:
+             if len(qualified_table_name.split(".")) != 3:
+                 logger.debug(
+                     f"Skipping table {qualified_table_name} as it does not match the qualified_table_name format"
+                 )
+                 continue
+ 
+             urn = make_urn(
+                 config=self.config,
+                 platform_instance_resolver=self.platform_instance_resolver,
+                 data_platform_pair=self.get_platform_pair(),
+                 server=server,
+                 qualified_table_name=qualified_table_name,
+             )
+ 
+             dataplatform_tables.append(
+                 DataPlatformTable(
+                     data_platform_pair=self.get_platform_pair(),
+                     urn=urn,
+                 )
+             )
+ 
+         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
+ 
+         return Lineage(
+             upstreams=dataplatform_tables,
+             column_lineage=[],
+         )
+ 
+     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
+         if (
+             data_access_tokens[0]
+             != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
+         ):
+             return None
+ 
+         database: Optional[str] = get_next_item(data_access_tokens, "Database")
+ 
+         if (
+             database and database != Constant.M_QUERY_NULL
+         ):  # database name is explicitly set
+             return database
+ 
+         return get_next_item(  # database name is set in the Name argument
+             data_access_tokens, "Name"
+         ) or get_next_item(  # if neither of the above is available, try Catalog
+             data_access_tokens, "Catalog"
+         )
+ 
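get_db_name above resolves the Databricks database name in order: an explicit non-null Database argument, then Name, then Catalog. A sketch with a hypothetical token list, assuming Constant.M_QUERY_NULL is the literal "null" and the first token matches the Databricks multi-cloud platform name:

    tokens = ["Databricks.Catalogs", "Database", "null", "Name", "analytics"]
    # get_next_item(tokens, "Database") returns "null", which is treated as
    # unset, so the lookup falls through to get_next_item(tokens, "Name")
    # and resolves to "analytics".
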
786
+ def create_lineage(
787
+ self, data_access_func_detail: DataAccessFunctionDetail
788
+ ) -> Lineage:
789
+ t1: Tree = cast(
790
+ Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
791
+ )
792
+ flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
793
+
794
+ if len(flat_argument_list) != 2:
795
+ logger.debug(
796
+ f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}"
797
+ )
798
+ logger.debug(f"Flat argument list = {flat_argument_list}")
799
+ return Lineage.empty()
800
+
801
+ data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
802
+ tree_function.token_values(flat_argument_list[0])
803
+ )
804
+
805
+ if not self.is_native_parsing_supported(data_access_tokens[0]):
806
+ logger.debug(
807
+ f"Unsupported native-query data-platform = {data_access_tokens[0]}"
808
+ )
809
+ logger.debug(
810
+ f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
811
+ )
812
+
813
+ return Lineage.empty()
814
+
815
+ if len(data_access_tokens[0]) < 3:
816
+ logger.debug(
817
+ f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty "
818
+ "list"
819
+ )
820
+ return Lineage.empty()
821
+
822
+ self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
823
+ data_access_tokens[0]
824
+ ]
825
+ # The First argument is the query
826
+ sql_query: str = tree_function.strip_char_from_list(
827
+ values=tree_function.remove_whitespaces_from_list(
828
+ tree_function.token_values(flat_argument_list[1])
829
+ ),
830
+ )[
831
+ 0
832
+ ] # Remove any whitespaces and double quotes character
833
+
834
+ server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
835
+
836
+ if self.config.enable_advance_lineage_sql_construct is False:
837
+ # Use previous parser to generate URN to keep backward compatibility
838
+ return self.create_urn_using_old_parser(
839
+ query=sql_query,
840
+ server=server,
841
+ )
842
+
843
+ database_name: Optional[str] = self.get_db_name(data_access_tokens)
844
+
845
+ return self.parse_custom_sql(
846
+ query=sql_query,
847
+ server=server,
848
+ database=database_name,
849
+ schema=None,
850
+ )
851
+
852
+
+ class SupportedPattern(Enum):
+     DATABRICKS_QUERY = (
+         DatabricksLineage,
+         FunctionName.DATABRICK_DATA_ACCESS,
+     )
+ 
+     DATABRICKS_MULTI_CLOUD = (
+         DatabricksLineage,
+         FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
+     )
+ 
+     POSTGRES_SQL = (
+         PostgresLineage,
+         FunctionName.POSTGRESQL_DATA_ACCESS,
+     )
+ 
+     ORACLE = (
+         OracleLineage,
+         FunctionName.ORACLE_DATA_ACCESS,
+     )
+ 
+     SNOWFLAKE = (
+         SnowflakeLineage,
+         FunctionName.SNOWFLAKE_DATA_ACCESS,
+     )
+ 
+     MS_SQL = (
+         MSSqlLineage,
+         FunctionName.MSSQL_DATA_ACCESS,
+     )
+ 
+     GOOGLE_BIG_QUERY = (
+         GoogleBigQueryLineage,
+         FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
+     )
+ 
+     AMAZON_REDSHIFT = (
+         AmazonRedshiftLineage,
+         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
+     )
+ 
+     NATIVE_QUERY = (
+         NativeQueryLineage,
+         FunctionName.NATIVE_QUERY,
+     )
+ 
+     def handler(self) -> Type[AbstractLineage]:
+         return self.value[0]
+ 
+     def function_name(self) -> str:
+         return self.value[1].value
+ 
+     @staticmethod
+     def get_function_names() -> List[str]:
+         functions: List[str] = []
+         for supported_resolver in SupportedPattern:
+             functions.append(supported_resolver.function_name())
+ 
+         return functions
+ 
+     @staticmethod
+     def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]:
+         logger.debug(f"Looking for pattern-handler for {function_name}")
+         for supported_resolver in SupportedPattern:
+             if function_name == supported_resolver.function_name():
+                 return supported_resolver
+         logger.debug(f"pattern-handler not found for function_name {function_name}")
+         return None
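
Callers resolve a handler from an M-Query function name and then instantiate it with the ingestion context. A sketch using Sql.Database, the MS-SQL data-access function named in the AbstractLineage docstring (assuming that is the value of FunctionName.MSSQL_DATA_ACCESS; the constructor arguments are assumed to come from the source's ingestion run):

    pattern = SupportedPattern.get_pattern_handler("Sql.Database")
    if pattern is not None:
        handler_class = pattern.handler()  # -> MSSqlLineage
        lineage = handler_class(
            ctx=ctx,
            table=table,
            config=config,
            reporter=reporter,
            platform_instance_resolver=resolver,
        ).create_lineage(data_access_func_detail)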