acryl-datahub 0.15.0rc3__py3-none-any.whl → 0.15.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,286 +1,33 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
- from dataclasses import dataclass
4
- from enum import Enum
5
- from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
3
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
6
4
 
7
5
  from lark import Tree
8
6
 
9
- import datahub.emitter.mce_builder as builder
10
7
  from datahub.ingestion.api.common import PipelineContext
11
8
  from datahub.ingestion.source.powerbi.config import (
12
- Constant,
13
- DataBricksPlatformDetail,
14
- DataPlatformPair,
15
- PlatformDetail,
16
9
  PowerBiDashboardSourceConfig,
17
10
  PowerBiDashboardSourceReport,
18
- PowerBIPlatformDetail,
19
- SupportedDataPlatform,
20
11
  )
21
12
  from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
22
13
  AbstractDataPlatformInstanceResolver,
23
14
  )
24
- from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
15
+ from datahub.ingestion.source.powerbi.m_query import tree_function
25
16
  from datahub.ingestion.source.powerbi.m_query.data_classes import (
26
17
  TRACE_POWERBI_MQUERY_PARSER,
27
- AbstractIdentifierAccessor,
28
18
  DataAccessFunctionDetail,
29
19
  IdentifierAccessor,
30
- ReferencedTable,
20
+ Lineage,
21
+ )
22
+ from datahub.ingestion.source.powerbi.m_query.pattern_handler import (
23
+ AbstractLineage,
24
+ SupportedPattern,
31
25
  )
32
26
  from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
33
- from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
34
27
 
35
28
  logger = logging.getLogger(__name__)
36
29
 
37
30
 
38
- @dataclass
39
- class DataPlatformTable:
40
- data_platform_pair: DataPlatformPair
41
- urn: str
42
-
43
-
44
- @dataclass
45
- class Lineage:
46
- upstreams: List[DataPlatformTable]
47
- column_lineage: List[ColumnLineageInfo]
48
-
49
- @staticmethod
50
- def empty() -> "Lineage":
51
- return Lineage(upstreams=[], column_lineage=[])
52
-
53
-
54
- def urn_to_lowercase(value: str, flag: bool) -> str:
55
- if flag is True:
56
- return value.lower()
57
-
58
- return value
59
-
60
-
61
- def urn_creator(
62
- config: PowerBiDashboardSourceConfig,
63
- platform_instance_resolver: AbstractDataPlatformInstanceResolver,
64
- data_platform_pair: DataPlatformPair,
65
- server: str,
66
- qualified_table_name: str,
67
- ) -> str:
68
- platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
69
- PowerBIPlatformDetail(
70
- data_platform_pair=data_platform_pair,
71
- data_platform_server=server,
72
- )
73
- )
74
-
75
- return builder.make_dataset_urn_with_platform_instance(
76
- platform=data_platform_pair.datahub_data_platform_name,
77
- platform_instance=platform_detail.platform_instance,
78
- env=platform_detail.env,
79
- name=urn_to_lowercase(
80
- qualified_table_name, config.convert_lineage_urns_to_lowercase
81
- ),
82
- )
83
-
84
-
85
- def get_next_item(items: List[str], item: str) -> Optional[str]:
86
- if item in items:
87
- try:
88
- index = items.index(item)
89
- return items[index + 1]
90
- except IndexError:
91
- logger.debug(f'item:"{item}", not found in item-list: {items}')
92
- return None
93
-
94
-
95
- class AbstractDataPlatformTableCreator(ABC):
96
- """
97
- Base class to share common functionalities among different dataplatform for M-Query parsing.
98
-
99
- To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and
100
- the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query.
101
-
102
- let
103
- Source = Sql.Database("localhost", "library"),
104
- dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
105
- in
106
- dbo_book_issue
107
-
108
- It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument
109
- of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL.
110
-
111
- DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern
112
-
113
- data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to
114
- find out database-name , schema-name and table-name also varies as per dataplatform.
115
-
116
- Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query
117
-
118
- let
119
- Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
120
- in
121
- Source
122
-
123
- In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query.
124
-
125
- NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing.
126
-
127
- """
128
-
129
- ctx: PipelineContext
130
- table: Table
131
- config: PowerBiDashboardSourceConfig
132
- reporter: PowerBiDashboardSourceReport
133
- platform_instance_resolver: AbstractDataPlatformInstanceResolver
134
-
135
- def __init__(
136
- self,
137
- ctx: PipelineContext,
138
- table: Table,
139
- config: PowerBiDashboardSourceConfig,
140
- reporter: PowerBiDashboardSourceReport,
141
- platform_instance_resolver: AbstractDataPlatformInstanceResolver,
142
- ) -> None:
143
- super().__init__()
144
- self.ctx = ctx
145
- self.table = table
146
- self.config = config
147
- self.reporter = reporter
148
- self.platform_instance_resolver = platform_instance_resolver
149
-
150
- @abstractmethod
151
- def create_lineage(
152
- self, data_access_func_detail: DataAccessFunctionDetail
153
- ) -> Lineage:
154
- pass
155
-
156
- @abstractmethod
157
- def get_platform_pair(self) -> DataPlatformPair:
158
- pass
159
-
160
- @staticmethod
161
- def get_db_detail_from_argument(
162
- arg_list: Tree,
163
- ) -> Tuple[Optional[str], Optional[str]]:
164
- arguments: List[str] = tree_function.strip_char_from_list(
165
- values=tree_function.remove_whitespaces_from_list(
166
- tree_function.token_values(arg_list)
167
- ),
168
- )
169
-
170
- if len(arguments) < 2:
171
- logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
172
- return None, None
173
-
174
- return arguments[0], arguments[1]
175
-
176
- @staticmethod
177
- def create_reference_table(
178
- arg_list: Tree,
179
- table_detail: Dict[str, str],
180
- ) -> Optional[ReferencedTable]:
181
- arguments: List[str] = tree_function.strip_char_from_list(
182
- values=tree_function.remove_whitespaces_from_list(
183
- tree_function.token_values(arg_list)
184
- ),
185
- )
186
-
187
- logger.debug(f"Processing arguments {arguments}")
188
-
189
- if (
190
- len(arguments)
191
- >= 4 # [0] is warehouse FQDN.
192
- # [1] is endpoint, we are not using it.
193
- # [2] is "Catalog" key
194
- # [3] is catalog's value
195
- ):
196
- return ReferencedTable(
197
- warehouse=arguments[0],
198
- catalog=arguments[3],
199
- # As per my observation, database and catalog names are same in M-Query
200
- database=table_detail["Database"]
201
- if table_detail.get("Database")
202
- else arguments[3],
203
- schema=table_detail["Schema"],
204
- table=table_detail.get("Table") or table_detail["View"],
205
- )
206
- elif len(arguments) == 2:
207
- return ReferencedTable(
208
- warehouse=arguments[0],
209
- database=table_detail["Database"],
210
- schema=table_detail["Schema"],
211
- table=table_detail.get("Table") or table_detail["View"],
212
- catalog=None,
213
- )
214
-
215
- return None
216
-
217
- def parse_custom_sql(
218
- self, query: str, server: str, database: Optional[str], schema: Optional[str]
219
- ) -> Lineage:
220
- dataplatform_tables: List[DataPlatformTable] = []
221
-
222
- platform_detail: PlatformDetail = (
223
- self.platform_instance_resolver.get_platform_instance(
224
- PowerBIPlatformDetail(
225
- data_platform_pair=self.get_platform_pair(),
226
- data_platform_server=server,
227
- )
228
- )
229
- )
230
-
231
- query = native_sql_parser.remove_drop_statement(
232
- native_sql_parser.remove_special_characters(query)
233
- )
234
-
235
- parsed_result: Optional[
236
- "SqlParsingResult"
237
- ] = native_sql_parser.parse_custom_sql(
238
- ctx=self.ctx,
239
- query=query,
240
- platform=self.get_platform_pair().datahub_data_platform_name,
241
- platform_instance=platform_detail.platform_instance,
242
- env=platform_detail.env,
243
- database=database,
244
- schema=schema,
245
- )
246
-
247
- if parsed_result is None:
248
- self.reporter.info(
249
- title=Constant.SQL_PARSING_FAILURE,
250
- message="Fail to parse native sql present in PowerBI M-Query",
251
- context=f"table-name={self.table.full_name}, sql={query}",
252
- )
253
- return Lineage.empty()
254
-
255
- if parsed_result.debug_info and parsed_result.debug_info.table_error:
256
- self.reporter.warning(
257
- title=Constant.SQL_PARSING_FAILURE,
258
- message="Fail to parse native sql present in PowerBI M-Query",
259
- context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}",
260
- )
261
- return Lineage.empty()
262
-
263
- for urn in parsed_result.in_tables:
264
- dataplatform_tables.append(
265
- DataPlatformTable(
266
- data_platform_pair=self.get_platform_pair(),
267
- urn=urn,
268
- )
269
- )
270
-
271
- logger.debug(f"Native Query parsed result={parsed_result}")
272
- logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
273
-
274
- return Lineage(
275
- upstreams=dataplatform_tables,
276
- column_lineage=(
277
- parsed_result.column_lineage
278
- if parsed_result.column_lineage is not None
279
- else []
280
- ),
281
- )
282
-
283
-
284
31
  class AbstractDataAccessMQueryResolver(ABC):
285
32
  table: Table
286
33
  parse_tree: Tree
@@ -299,10 +46,10 @@ class AbstractDataAccessMQueryResolver(ABC):
299
46
  self.parse_tree = parse_tree
300
47
  self.reporter = reporter
301
48
  self.parameters = parameters
302
- self.data_access_functions = SupportedResolver.get_function_names()
49
+ self.data_access_functions = SupportedPattern.get_function_names()
303
50
 
304
51
  @abstractmethod
305
- def resolve_to_data_platform_table_list(
52
+ def resolve_to_lineage(
306
53
  self,
307
54
  ctx: PipelineContext,
308
55
  config: PowerBiDashboardSourceConfig,
@@ -318,7 +65,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
318
65
  This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail.
319
66
 
320
67
  Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator
321
- (see method resolve_to_data_platform_table_list).
68
+ (see method resolve_to_lineage).
322
69
 
323
70
  Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance
324
71
  to the respective DataPlatformTable instance as per dataplatform.
@@ -602,7 +349,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
602
349
 
603
350
  return table_links
604
351
 
605
- def resolve_to_data_platform_table_list(
352
+ def resolve_to_lineage(
606
353
  self,
607
354
  ctx: PipelineContext,
608
355
  config: PowerBiDashboardSourceConfig,
@@ -630,7 +377,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
630
377
  # Each item is data-access function
631
378
  for f_detail in table_links:
632
379
  # Get & Check if we support data-access-function available in M-Query
633
- supported_resolver = SupportedResolver.get_resolver(
380
+ supported_resolver = SupportedPattern.get_pattern_handler(
634
381
  f_detail.data_access_function_name
635
382
  )
636
383
  if supported_resolver is None:
@@ -643,11 +390,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
643
390
  )
644
391
  continue
645
392
 
646
- # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
647
- # & also pass additional information that will be need to generate urn
648
- table_qualified_name_creator: (
649
- AbstractDataPlatformTableCreator
650
- ) = supported_resolver.get_table_full_name_creator()(
393
+ # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
394
+ # & also pass additional information that will be need to generate lineage
395
+ pattern_handler: (AbstractLineage) = supported_resolver.handler()(
651
396
  ctx=ctx,
652
397
  table=self.table,
653
398
  config=config,
@@ -655,673 +400,6 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
655
400
  platform_instance_resolver=platform_instance_resolver,
656
401
  )
657
402
 
658
- lineage.append(table_qualified_name_creator.create_lineage(f_detail))
403
+ lineage.append(pattern_handler.create_lineage(f_detail))
659
404
 
660
405
  return lineage
661
-
662
-
663
- class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
664
- """
665
- These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern
666
- let
667
- Source = Sql.Database("localhost", "library"),
668
- dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
669
- in
670
- dbo_book_issue
671
- """
672
-
673
- def two_level_access_pattern(
674
- self, data_access_func_detail: DataAccessFunctionDetail
675
- ) -> Lineage:
676
- logger.debug(
677
- f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
678
- )
679
-
680
- server, db_name = self.get_db_detail_from_argument(
681
- data_access_func_detail.arg_list
682
- )
683
- if server is None or db_name is None:
684
- return Lineage.empty() # Return an empty list
685
-
686
- schema_name: str = cast(
687
- IdentifierAccessor, data_access_func_detail.identifier_accessor
688
- ).items["Schema"]
689
-
690
- table_name: str = cast(
691
- IdentifierAccessor, data_access_func_detail.identifier_accessor
692
- ).items["Item"]
693
-
694
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
695
-
696
- logger.debug(
697
- f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
698
- )
699
-
700
- urn = urn_creator(
701
- config=self.config,
702
- platform_instance_resolver=self.platform_instance_resolver,
703
- data_platform_pair=self.get_platform_pair(),
704
- server=server,
705
- qualified_table_name=qualified_table_name,
706
- )
707
- return Lineage(
708
- upstreams=[
709
- DataPlatformTable(
710
- data_platform_pair=self.get_platform_pair(),
711
- urn=urn,
712
- )
713
- ],
714
- column_lineage=[],
715
- )
716
-
717
-
718
- class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
719
- def create_lineage(
720
- self, data_access_func_detail: DataAccessFunctionDetail
721
- ) -> Lineage:
722
- return self.two_level_access_pattern(data_access_func_detail)
723
-
724
- def get_platform_pair(self) -> DataPlatformPair:
725
- return SupportedDataPlatform.POSTGRES_SQL.value
726
-
727
-
728
- class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
729
- # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
730
- DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo
731
-
732
- def get_platform_pair(self) -> DataPlatformPair:
733
- return SupportedDataPlatform.MS_SQL.value
734
-
735
- def create_urn_using_old_parser(
736
- self, query: str, db_name: str, server: str
737
- ) -> List[DataPlatformTable]:
738
- dataplatform_tables: List[DataPlatformTable] = []
739
-
740
- tables: List[str] = native_sql_parser.get_tables(query)
741
-
742
- for parsed_table in tables:
743
- # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")]
744
- components = [v.strip("[]") for v in parsed_table.split(".")]
745
- if len(components) == 3:
746
- database, schema, table = components
747
- elif len(components) == 2:
748
- schema, table = components
749
- database = db_name
750
- elif len(components) == 1:
751
- (table,) = components
752
- database = db_name
753
- schema = MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA
754
- else:
755
- self.reporter.warning(
756
- title="Invalid table format",
757
- message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as <db-name>.<schema-name>.<table-name> in the SQL.",
758
- context=f"table-name={self.table.full_name}",
759
- )
760
- continue
761
-
762
- qualified_table_name = f"{database}.{schema}.{table}"
763
- urn = urn_creator(
764
- config=self.config,
765
- platform_instance_resolver=self.platform_instance_resolver,
766
- data_platform_pair=self.get_platform_pair(),
767
- server=server,
768
- qualified_table_name=qualified_table_name,
769
- )
770
- dataplatform_tables.append(
771
- DataPlatformTable(
772
- data_platform_pair=self.get_platform_pair(),
773
- urn=urn,
774
- )
775
- )
776
-
777
- logger.debug(f"Generated upstream tables = {dataplatform_tables}")
778
-
779
- return dataplatform_tables
780
-
781
- def create_lineage(
782
- self, data_access_func_detail: DataAccessFunctionDetail
783
- ) -> Lineage:
784
- arguments: List[str] = tree_function.strip_char_from_list(
785
- values=tree_function.remove_whitespaces_from_list(
786
- tree_function.token_values(data_access_func_detail.arg_list)
787
- ),
788
- )
789
-
790
- server, database = self.get_db_detail_from_argument(
791
- data_access_func_detail.arg_list
792
- )
793
- if server is None or database is None:
794
- return Lineage.empty() # Return an empty list
795
-
796
- assert server
797
- assert database # to silent the lint
798
-
799
- query: Optional[str] = get_next_item(arguments, "Query")
800
- if query:
801
- if self.config.enable_advance_lineage_sql_construct is False:
802
- # Use previous parser to generate URN to keep backward compatibility
803
- return Lineage(
804
- upstreams=self.create_urn_using_old_parser(
805
- query=query,
806
- db_name=database,
807
- server=server,
808
- ),
809
- column_lineage=[],
810
- )
811
-
812
- return self.parse_custom_sql(
813
- query=query,
814
- database=database,
815
- server=server,
816
- schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA,
817
- )
818
-
819
- # It is a regular case of MS-SQL
820
- logger.debug("Handling with regular case")
821
- return self.two_level_access_pattern(data_access_func_detail)
822
-
823
-
824
- class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator):
825
- def get_platform_pair(self) -> DataPlatformPair:
826
- return SupportedDataPlatform.ORACLE.value
827
-
828
- @staticmethod
829
- def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
830
- error_message: str = (
831
- f"The target argument ({value}) should in the format of <host-name>:<port>/<db-name>["
832
- ".<domain>]"
833
- )
834
- splitter_result: List[str] = value.split("/")
835
- if len(splitter_result) != 2:
836
- logger.debug(error_message)
837
- return None, None
838
-
839
- db_name = splitter_result[1].split(".")[0]
840
-
841
- return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
842
-
843
- def create_lineage(
844
- self, data_access_func_detail: DataAccessFunctionDetail
845
- ) -> Lineage:
846
- logger.debug(
847
- f"Processing Oracle data-access function detail {data_access_func_detail}"
848
- )
849
-
850
- arguments: List[str] = tree_function.remove_whitespaces_from_list(
851
- tree_function.token_values(data_access_func_detail.arg_list)
852
- )
853
-
854
- server, db_name = self._get_server_and_db_name(arguments[0])
855
-
856
- if db_name is None or server is None:
857
- return Lineage.empty()
858
-
859
- schema_name: str = cast(
860
- IdentifierAccessor, data_access_func_detail.identifier_accessor
861
- ).items["Schema"]
862
-
863
- table_name: str = cast(
864
- IdentifierAccessor,
865
- cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
866
- ).items["Name"]
867
-
868
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
869
-
870
- urn = urn_creator(
871
- config=self.config,
872
- platform_instance_resolver=self.platform_instance_resolver,
873
- data_platform_pair=self.get_platform_pair(),
874
- server=server,
875
- qualified_table_name=qualified_table_name,
876
- )
877
-
878
- return Lineage(
879
- upstreams=[
880
- DataPlatformTable(
881
- data_platform_pair=self.get_platform_pair(),
882
- urn=urn,
883
- )
884
- ],
885
- column_lineage=[],
886
- )
887
-
888
-
889
- class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator):
890
- def form_qualified_table_name(
891
- self,
892
- table_reference: ReferencedTable,
893
- data_platform_pair: DataPlatformPair,
894
- ) -> str:
895
- platform_detail: PlatformDetail = (
896
- self.platform_instance_resolver.get_platform_instance(
897
- PowerBIPlatformDetail(
898
- data_platform_pair=data_platform_pair,
899
- data_platform_server=table_reference.warehouse,
900
- )
901
- )
902
- )
903
-
904
- metastore: Optional[str] = None
905
-
906
- qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
907
-
908
- if isinstance(platform_detail, DataBricksPlatformDetail):
909
- metastore = platform_detail.metastore
910
-
911
- if metastore is not None:
912
- return f"{metastore}.{qualified_table_name}"
913
-
914
- return qualified_table_name
915
-
916
- def create_lineage(
917
- self, data_access_func_detail: DataAccessFunctionDetail
918
- ) -> Lineage:
919
- logger.debug(
920
- f"Processing Databrick data-access function detail {data_access_func_detail}"
921
- )
922
- table_detail: Dict[str, str] = {}
923
- temp_accessor: Optional[
924
- Union[IdentifierAccessor, AbstractIdentifierAccessor]
925
- ] = data_access_func_detail.identifier_accessor
926
-
927
- while temp_accessor:
928
- if isinstance(temp_accessor, IdentifierAccessor):
929
- # Condition to handle databricks M-query pattern where table, schema and database all are present in
930
- # the same invoke statement
931
- if all(
932
- element in temp_accessor.items
933
- for element in ["Item", "Schema", "Catalog"]
934
- ):
935
- table_detail["Schema"] = temp_accessor.items["Schema"]
936
- table_detail["Table"] = temp_accessor.items["Item"]
937
- else:
938
- table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
939
- "Name"
940
- ]
941
-
942
- if temp_accessor.next is not None:
943
- temp_accessor = temp_accessor.next
944
- else:
945
- break
946
- else:
947
- logger.debug(
948
- "expecting instance to be IdentifierAccessor, please check if parsing is done properly"
949
- )
950
- return Lineage.empty()
951
-
952
- table_reference = self.create_reference_table(
953
- arg_list=data_access_func_detail.arg_list,
954
- table_detail=table_detail,
955
- )
956
-
957
- if table_reference:
958
- qualified_table_name: str = self.form_qualified_table_name(
959
- table_reference=table_reference,
960
- data_platform_pair=self.get_platform_pair(),
961
- )
962
-
963
- urn = urn_creator(
964
- config=self.config,
965
- platform_instance_resolver=self.platform_instance_resolver,
966
- data_platform_pair=self.get_platform_pair(),
967
- server=table_reference.warehouse,
968
- qualified_table_name=qualified_table_name,
969
- )
970
-
971
- return Lineage(
972
- upstreams=[
973
- DataPlatformTable(
974
- data_platform_pair=self.get_platform_pair(),
975
- urn=urn,
976
- )
977
- ],
978
- column_lineage=[],
979
- )
980
-
981
- return Lineage.empty()
982
-
983
- def get_platform_pair(self) -> DataPlatformPair:
984
- return SupportedDataPlatform.DATABRICK_SQL.value
985
-
986
-
987
- class DefaultThreeStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
988
- def get_datasource_server(
989
- self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
990
- ) -> str:
991
- return tree_function.strip_char_from_list([arguments[0]])[0]
992
-
993
- def create_lineage(
994
- self, data_access_func_detail: DataAccessFunctionDetail
995
- ) -> Lineage:
996
- logger.debug(
997
- f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
998
- )
999
-
1000
- arguments: List[str] = tree_function.remove_whitespaces_from_list(
1001
- tree_function.token_values(data_access_func_detail.arg_list)
1002
- )
1003
- # First is database name
1004
- db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore
1005
- # Second is schema name
1006
- schema_name: str = cast(
1007
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore
1008
- ).items["Name"]
1009
- # Third is table name
1010
- table_name: str = cast(
1011
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
1012
- ).items["Name"]
1013
-
1014
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
1015
-
1016
- logger.debug(
1017
- f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
1018
- )
1019
-
1020
- server: str = self.get_datasource_server(arguments, data_access_func_detail)
1021
-
1022
- urn = urn_creator(
1023
- config=self.config,
1024
- platform_instance_resolver=self.platform_instance_resolver,
1025
- data_platform_pair=self.get_platform_pair(),
1026
- server=server,
1027
- qualified_table_name=qualified_table_name,
1028
- )
1029
-
1030
- return Lineage(
1031
- upstreams=[
1032
- DataPlatformTable(
1033
- data_platform_pair=self.get_platform_pair(),
1034
- urn=urn,
1035
- )
1036
- ],
1037
- column_lineage=[],
1038
- )
1039
-
1040
-
1041
- class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
1042
- def get_platform_pair(self) -> DataPlatformPair:
1043
- return SupportedDataPlatform.SNOWFLAKE.value
1044
-
1045
-
1046
- class GoogleBigQueryDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
1047
- def get_platform_pair(self) -> DataPlatformPair:
1048
- return SupportedDataPlatform.GOOGLE_BIGQUERY.value
1049
-
1050
- def get_datasource_server(
1051
- self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
1052
- ) -> str:
1053
- # In Google BigQuery server is project-name
1054
- # condition to silent lint, it is not going to be None
1055
- return (
1056
- data_access_func_detail.identifier_accessor.items["Name"]
1057
- if data_access_func_detail.identifier_accessor is not None
1058
- else ""
1059
- )
1060
-
1061
-
1062
- class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator):
1063
- def get_platform_pair(self) -> DataPlatformPair:
1064
- return SupportedDataPlatform.AMAZON_REDSHIFT.value
1065
-
1066
- def create_lineage(
1067
- self, data_access_func_detail: DataAccessFunctionDetail
1068
- ) -> Lineage:
1069
- logger.debug(
1070
- f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
1071
- )
1072
-
1073
- server, db_name = self.get_db_detail_from_argument(
1074
- data_access_func_detail.arg_list
1075
- )
1076
- if db_name is None or server is None:
1077
- return Lineage.empty() # Return empty list
1078
-
1079
- schema_name: str = cast(
1080
- IdentifierAccessor, data_access_func_detail.identifier_accessor
1081
- ).items["Name"]
1082
-
1083
- table_name: str = cast(
1084
- IdentifierAccessor,
1085
- cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
1086
- ).items["Name"]
1087
-
1088
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
1089
-
1090
- urn = urn_creator(
1091
- config=self.config,
1092
- platform_instance_resolver=self.platform_instance_resolver,
1093
- data_platform_pair=self.get_platform_pair(),
1094
- server=server,
1095
- qualified_table_name=qualified_table_name,
1096
- )
1097
-
1098
- return Lineage(
1099
- upstreams=[
1100
- DataPlatformTable(
1101
- data_platform_pair=self.get_platform_pair(),
1102
- urn=urn,
1103
- )
1104
- ],
1105
- column_lineage=[],
1106
- )
1107
-
1108
-
1109
- class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator):
1110
- SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
1111
- SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
1112
- SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
1113
- SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
1114
- }
1115
- current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
1116
-
1117
- def get_platform_pair(self) -> DataPlatformPair:
1118
- return self.current_data_platform.value
1119
-
1120
- @staticmethod
1121
- def is_native_parsing_supported(data_access_function_name: str) -> bool:
1122
- return (
1123
- data_access_function_name
1124
- in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
1125
- )
1126
-
1127
- def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
1128
- dataplatform_tables: List[DataPlatformTable] = []
1129
-
1130
- tables: List[str] = native_sql_parser.get_tables(query)
1131
-
1132
- for qualified_table_name in tables:
1133
- if len(qualified_table_name.split(".")) != 3:
1134
- logger.debug(
1135
- f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format"
1136
- )
1137
- continue
1138
-
1139
- urn = urn_creator(
1140
- config=self.config,
1141
- platform_instance_resolver=self.platform_instance_resolver,
1142
- data_platform_pair=self.get_platform_pair(),
1143
- server=server,
1144
- qualified_table_name=qualified_table_name,
1145
- )
1146
-
1147
- dataplatform_tables.append(
1148
- DataPlatformTable(
1149
- data_platform_pair=self.get_platform_pair(),
1150
- urn=urn,
1151
- )
1152
- )
1153
-
1154
- logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
1155
-
1156
- return Lineage(
1157
- upstreams=dataplatform_tables,
1158
- column_lineage=[],
1159
- )
1160
-
1161
- def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
1162
- if (
1163
- data_access_tokens[0]
1164
- != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
1165
- ):
1166
- return None
1167
-
1168
- database: Optional[str] = get_next_item(data_access_tokens, "Database")
1169
-
1170
- if (
1171
- database and database != Constant.M_QUERY_NULL
1172
- ): # database name is explicitly set
1173
- return database
1174
-
1175
- return get_next_item( # database name is set in Name argument
1176
- data_access_tokens, "Name"
1177
- ) or get_next_item( # If both above arguments are not available, then try Catalog
1178
- data_access_tokens, "Catalog"
1179
- )
1180
-
1181
- def create_lineage(
1182
- self, data_access_func_detail: DataAccessFunctionDetail
1183
- ) -> Lineage:
1184
- t1: Tree = cast(
1185
- Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
1186
- )
1187
- flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
1188
-
1189
- if len(flat_argument_list) != 2:
1190
- logger.debug(
1191
- f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}"
1192
- )
1193
- logger.debug(f"Flat argument list = {flat_argument_list}")
1194
- return Lineage.empty()
1195
-
1196
- data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
1197
- tree_function.token_values(flat_argument_list[0])
1198
- )
1199
-
1200
- if not self.is_native_parsing_supported(data_access_tokens[0]):
1201
- logger.debug(
1202
- f"Unsupported native-query data-platform = {data_access_tokens[0]}"
1203
- )
1204
- logger.debug(
1205
- f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
1206
- )
1207
-
1208
- return Lineage.empty()
1209
-
1210
- if len(data_access_tokens[0]) < 3:
1211
- logger.debug(
1212
- f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty "
1213
- "list"
1214
- )
1215
- return Lineage.empty()
1216
-
1217
- self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
1218
- data_access_tokens[0]
1219
- ]
1220
- # The First argument is the query
1221
- sql_query: str = tree_function.strip_char_from_list(
1222
- values=tree_function.remove_whitespaces_from_list(
1223
- tree_function.token_values(flat_argument_list[1])
1224
- ),
1225
- )[
1226
- 0
1227
- ] # Remove any whitespaces and double quotes character
1228
-
1229
- server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
1230
-
1231
- if self.config.enable_advance_lineage_sql_construct is False:
1232
- # Use previous parser to generate URN to keep backward compatibility
1233
- return self.create_urn_using_old_parser(
1234
- query=sql_query,
1235
- server=server,
1236
- )
1237
-
1238
- database_name: Optional[str] = self.get_db_name(data_access_tokens)
1239
-
1240
- return self.parse_custom_sql(
1241
- query=sql_query,
1242
- server=server,
1243
- database=database_name,
1244
- schema=None,
1245
- )
1246
-
1247
-
1248
- class FunctionName(Enum):
1249
- NATIVE_QUERY = "Value.NativeQuery"
1250
- POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
1251
- ORACLE_DATA_ACCESS = "Oracle.Database"
1252
- SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
1253
- MSSQL_DATA_ACCESS = "Sql.Database"
1254
- DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
1255
- GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
1256
- AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
1257
- DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
1258
-
1259
-
1260
- class SupportedResolver(Enum):
1261
- DATABRICKS_QUERY = (
1262
- DatabrickDataPlatformTableCreator,
1263
- FunctionName.DATABRICK_DATA_ACCESS,
1264
- )
1265
-
1266
- DATABRICKS_MULTI_CLOUD = (
1267
- DatabrickDataPlatformTableCreator,
1268
- FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
1269
- )
1270
-
1271
- POSTGRES_SQL = (
1272
- PostgresDataPlatformTableCreator,
1273
- FunctionName.POSTGRESQL_DATA_ACCESS,
1274
- )
1275
-
1276
- ORACLE = (
1277
- OracleDataPlatformTableCreator,
1278
- FunctionName.ORACLE_DATA_ACCESS,
1279
- )
1280
-
1281
- SNOWFLAKE = (
1282
- SnowflakeDataPlatformTableCreator,
1283
- FunctionName.SNOWFLAKE_DATA_ACCESS,
1284
- )
1285
-
1286
- MS_SQL = (
1287
- MSSqlDataPlatformTableCreator,
1288
- FunctionName.MSSQL_DATA_ACCESS,
1289
- )
1290
-
1291
- GOOGLE_BIG_QUERY = (
1292
- GoogleBigQueryDataPlatformTableCreator,
1293
- FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
1294
- )
1295
-
1296
- AMAZON_REDSHIFT = (
1297
- AmazonRedshiftDataPlatformTableCreator,
1298
- FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
1299
- )
1300
-
1301
- NATIVE_QUERY = (
1302
- NativeQueryDataPlatformTableCreator,
1303
- FunctionName.NATIVE_QUERY,
1304
- )
1305
-
1306
- def get_table_full_name_creator(self) -> Type[AbstractDataPlatformTableCreator]:
1307
- return self.value[0]
1308
-
1309
- def get_function_name(self) -> str:
1310
- return self.value[1].value
1311
-
1312
- @staticmethod
1313
- def get_function_names() -> List[str]:
1314
- functions: List[str] = []
1315
- for supported_resolver in SupportedResolver:
1316
- functions.append(supported_resolver.get_function_name())
1317
-
1318
- return functions
1319
-
1320
- @staticmethod
1321
- def get_resolver(function_name: str) -> Optional["SupportedResolver"]:
1322
- logger.debug(f"Looking for resolver {function_name}")
1323
- for supported_resolver in SupportedResolver:
1324
- if function_name == supported_resolver.get_function_name():
1325
- return supported_resolver
1326
- logger.debug(f"Resolver not found for function_name {function_name}")
1327
- return None