acryl-datahub 0.15.0rc2__py3-none-any.whl → 0.15.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0rc2.dist-info → acryl_datahub-0.15.0rc4.dist-info}/METADATA +2390 -2390
- {acryl_datahub-0.15.0rc2.dist-info → acryl_datahub-0.15.0rc4.dist-info}/RECORD +20 -19
- {acryl_datahub-0.15.0rc2.dist-info → acryl_datahub-0.15.0rc4.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/run/pipeline.py +5 -4
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +22 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +3 -1
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +34 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +920 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +16 -938
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/telemetry/telemetry.py +23 -9
- {acryl_datahub-0.15.0rc2.dist-info → acryl_datahub-0.15.0rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc2.dist-info → acryl_datahub-0.15.0rc4.dist-info}/top_level.txt +0 -0
|
@@ -1,286 +1,33 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
|
-
from
|
|
4
|
-
from enum import Enum
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
|
6
4
|
|
|
7
5
|
from lark import Tree
|
|
8
6
|
|
|
9
|
-
import datahub.emitter.mce_builder as builder
|
|
10
7
|
from datahub.ingestion.api.common import PipelineContext
|
|
11
8
|
from datahub.ingestion.source.powerbi.config import (
|
|
12
|
-
Constant,
|
|
13
|
-
DataBricksPlatformDetail,
|
|
14
|
-
DataPlatformPair,
|
|
15
|
-
PlatformDetail,
|
|
16
9
|
PowerBiDashboardSourceConfig,
|
|
17
10
|
PowerBiDashboardSourceReport,
|
|
18
|
-
PowerBIPlatformDetail,
|
|
19
|
-
SupportedDataPlatform,
|
|
20
11
|
)
|
|
21
12
|
from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
|
|
22
13
|
AbstractDataPlatformInstanceResolver,
|
|
23
14
|
)
|
|
24
|
-
from datahub.ingestion.source.powerbi.m_query import
|
|
15
|
+
from datahub.ingestion.source.powerbi.m_query import tree_function
|
|
25
16
|
from datahub.ingestion.source.powerbi.m_query.data_classes import (
|
|
26
17
|
TRACE_POWERBI_MQUERY_PARSER,
|
|
27
|
-
AbstractIdentifierAccessor,
|
|
28
18
|
DataAccessFunctionDetail,
|
|
29
19
|
IdentifierAccessor,
|
|
30
|
-
|
|
20
|
+
Lineage,
|
|
21
|
+
)
|
|
22
|
+
from datahub.ingestion.source.powerbi.m_query.pattern_handler import (
|
|
23
|
+
AbstractLineage,
|
|
24
|
+
SupportedPattern,
|
|
31
25
|
)
|
|
32
26
|
from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
|
|
33
|
-
from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
|
|
34
27
|
|
|
35
28
|
logger = logging.getLogger(__name__)
|
|
36
29
|
|
|
37
30
|
|
|
38
|
-
@dataclass
|
|
39
|
-
class DataPlatformTable:
|
|
40
|
-
data_platform_pair: DataPlatformPair
|
|
41
|
-
urn: str
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
@dataclass
|
|
45
|
-
class Lineage:
|
|
46
|
-
upstreams: List[DataPlatformTable]
|
|
47
|
-
column_lineage: List[ColumnLineageInfo]
|
|
48
|
-
|
|
49
|
-
@staticmethod
|
|
50
|
-
def empty() -> "Lineage":
|
|
51
|
-
return Lineage(upstreams=[], column_lineage=[])
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def urn_to_lowercase(value: str, flag: bool) -> str:
|
|
55
|
-
if flag is True:
|
|
56
|
-
return value.lower()
|
|
57
|
-
|
|
58
|
-
return value
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def urn_creator(
|
|
62
|
-
config: PowerBiDashboardSourceConfig,
|
|
63
|
-
platform_instance_resolver: AbstractDataPlatformInstanceResolver,
|
|
64
|
-
data_platform_pair: DataPlatformPair,
|
|
65
|
-
server: str,
|
|
66
|
-
qualified_table_name: str,
|
|
67
|
-
) -> str:
|
|
68
|
-
platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
|
|
69
|
-
PowerBIPlatformDetail(
|
|
70
|
-
data_platform_pair=data_platform_pair,
|
|
71
|
-
data_platform_server=server,
|
|
72
|
-
)
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
return builder.make_dataset_urn_with_platform_instance(
|
|
76
|
-
platform=data_platform_pair.datahub_data_platform_name,
|
|
77
|
-
platform_instance=platform_detail.platform_instance,
|
|
78
|
-
env=platform_detail.env,
|
|
79
|
-
name=urn_to_lowercase(
|
|
80
|
-
qualified_table_name, config.convert_lineage_urns_to_lowercase
|
|
81
|
-
),
|
|
82
|
-
)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def get_next_item(items: List[str], item: str) -> Optional[str]:
|
|
86
|
-
if item in items:
|
|
87
|
-
try:
|
|
88
|
-
index = items.index(item)
|
|
89
|
-
return items[index + 1]
|
|
90
|
-
except IndexError:
|
|
91
|
-
logger.debug(f'item:"{item}", not found in item-list: {items}')
|
|
92
|
-
return None
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
class AbstractDataPlatformTableCreator(ABC):
|
|
96
|
-
"""
|
|
97
|
-
Base class to share common functionalities among different dataplatform for M-Query parsing.
|
|
98
|
-
|
|
99
|
-
To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and
|
|
100
|
-
the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query.
|
|
101
|
-
|
|
102
|
-
let
|
|
103
|
-
Source = Sql.Database("localhost", "library"),
|
|
104
|
-
dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
|
|
105
|
-
in
|
|
106
|
-
dbo_book_issue
|
|
107
|
-
|
|
108
|
-
It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument
|
|
109
|
-
of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL.
|
|
110
|
-
|
|
111
|
-
DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern
|
|
112
|
-
|
|
113
|
-
data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to
|
|
114
|
-
find out database-name , schema-name and table-name also varies as per dataplatform.
|
|
115
|
-
|
|
116
|
-
Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query
|
|
117
|
-
|
|
118
|
-
let
|
|
119
|
-
Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
|
|
120
|
-
in
|
|
121
|
-
Source
|
|
122
|
-
|
|
123
|
-
In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query.
|
|
124
|
-
|
|
125
|
-
NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing.
|
|
126
|
-
|
|
127
|
-
"""
|
|
128
|
-
|
|
129
|
-
ctx: PipelineContext
|
|
130
|
-
table: Table
|
|
131
|
-
config: PowerBiDashboardSourceConfig
|
|
132
|
-
reporter: PowerBiDashboardSourceReport
|
|
133
|
-
platform_instance_resolver: AbstractDataPlatformInstanceResolver
|
|
134
|
-
|
|
135
|
-
def __init__(
|
|
136
|
-
self,
|
|
137
|
-
ctx: PipelineContext,
|
|
138
|
-
table: Table,
|
|
139
|
-
config: PowerBiDashboardSourceConfig,
|
|
140
|
-
reporter: PowerBiDashboardSourceReport,
|
|
141
|
-
platform_instance_resolver: AbstractDataPlatformInstanceResolver,
|
|
142
|
-
) -> None:
|
|
143
|
-
super().__init__()
|
|
144
|
-
self.ctx = ctx
|
|
145
|
-
self.table = table
|
|
146
|
-
self.config = config
|
|
147
|
-
self.reporter = reporter
|
|
148
|
-
self.platform_instance_resolver = platform_instance_resolver
|
|
149
|
-
|
|
150
|
-
@abstractmethod
|
|
151
|
-
def create_lineage(
|
|
152
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
153
|
-
) -> Lineage:
|
|
154
|
-
pass
|
|
155
|
-
|
|
156
|
-
@abstractmethod
|
|
157
|
-
def get_platform_pair(self) -> DataPlatformPair:
|
|
158
|
-
pass
|
|
159
|
-
|
|
160
|
-
@staticmethod
|
|
161
|
-
def get_db_detail_from_argument(
|
|
162
|
-
arg_list: Tree,
|
|
163
|
-
) -> Tuple[Optional[str], Optional[str]]:
|
|
164
|
-
arguments: List[str] = tree_function.strip_char_from_list(
|
|
165
|
-
values=tree_function.remove_whitespaces_from_list(
|
|
166
|
-
tree_function.token_values(arg_list)
|
|
167
|
-
),
|
|
168
|
-
)
|
|
169
|
-
|
|
170
|
-
if len(arguments) < 2:
|
|
171
|
-
logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
|
|
172
|
-
return None, None
|
|
173
|
-
|
|
174
|
-
return arguments[0], arguments[1]
|
|
175
|
-
|
|
176
|
-
@staticmethod
|
|
177
|
-
def create_reference_table(
|
|
178
|
-
arg_list: Tree,
|
|
179
|
-
table_detail: Dict[str, str],
|
|
180
|
-
) -> Optional[ReferencedTable]:
|
|
181
|
-
arguments: List[str] = tree_function.strip_char_from_list(
|
|
182
|
-
values=tree_function.remove_whitespaces_from_list(
|
|
183
|
-
tree_function.token_values(arg_list)
|
|
184
|
-
),
|
|
185
|
-
)
|
|
186
|
-
|
|
187
|
-
logger.debug(f"Processing arguments {arguments}")
|
|
188
|
-
|
|
189
|
-
if (
|
|
190
|
-
len(arguments)
|
|
191
|
-
>= 4 # [0] is warehouse FQDN.
|
|
192
|
-
# [1] is endpoint, we are not using it.
|
|
193
|
-
# [2] is "Catalog" key
|
|
194
|
-
# [3] is catalog's value
|
|
195
|
-
):
|
|
196
|
-
return ReferencedTable(
|
|
197
|
-
warehouse=arguments[0],
|
|
198
|
-
catalog=arguments[3],
|
|
199
|
-
# As per my observation, database and catalog names are same in M-Query
|
|
200
|
-
database=table_detail["Database"]
|
|
201
|
-
if table_detail.get("Database")
|
|
202
|
-
else arguments[3],
|
|
203
|
-
schema=table_detail["Schema"],
|
|
204
|
-
table=table_detail.get("Table") or table_detail["View"],
|
|
205
|
-
)
|
|
206
|
-
elif len(arguments) == 2:
|
|
207
|
-
return ReferencedTable(
|
|
208
|
-
warehouse=arguments[0],
|
|
209
|
-
database=table_detail["Database"],
|
|
210
|
-
schema=table_detail["Schema"],
|
|
211
|
-
table=table_detail.get("Table") or table_detail["View"],
|
|
212
|
-
catalog=None,
|
|
213
|
-
)
|
|
214
|
-
|
|
215
|
-
return None
|
|
216
|
-
|
|
217
|
-
def parse_custom_sql(
|
|
218
|
-
self, query: str, server: str, database: Optional[str], schema: Optional[str]
|
|
219
|
-
) -> Lineage:
|
|
220
|
-
dataplatform_tables: List[DataPlatformTable] = []
|
|
221
|
-
|
|
222
|
-
platform_detail: PlatformDetail = (
|
|
223
|
-
self.platform_instance_resolver.get_platform_instance(
|
|
224
|
-
PowerBIPlatformDetail(
|
|
225
|
-
data_platform_pair=self.get_platform_pair(),
|
|
226
|
-
data_platform_server=server,
|
|
227
|
-
)
|
|
228
|
-
)
|
|
229
|
-
)
|
|
230
|
-
|
|
231
|
-
query = native_sql_parser.remove_drop_statement(
|
|
232
|
-
native_sql_parser.remove_special_characters(query)
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
parsed_result: Optional[
|
|
236
|
-
"SqlParsingResult"
|
|
237
|
-
] = native_sql_parser.parse_custom_sql(
|
|
238
|
-
ctx=self.ctx,
|
|
239
|
-
query=query,
|
|
240
|
-
platform=self.get_platform_pair().datahub_data_platform_name,
|
|
241
|
-
platform_instance=platform_detail.platform_instance,
|
|
242
|
-
env=platform_detail.env,
|
|
243
|
-
database=database,
|
|
244
|
-
schema=schema,
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
if parsed_result is None:
|
|
248
|
-
self.reporter.info(
|
|
249
|
-
title=Constant.SQL_PARSING_FAILURE,
|
|
250
|
-
message="Fail to parse native sql present in PowerBI M-Query",
|
|
251
|
-
context=f"table-name={self.table.full_name}, sql={query}",
|
|
252
|
-
)
|
|
253
|
-
return Lineage.empty()
|
|
254
|
-
|
|
255
|
-
if parsed_result.debug_info and parsed_result.debug_info.table_error:
|
|
256
|
-
self.reporter.warning(
|
|
257
|
-
title=Constant.SQL_PARSING_FAILURE,
|
|
258
|
-
message="Fail to parse native sql present in PowerBI M-Query",
|
|
259
|
-
context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}",
|
|
260
|
-
)
|
|
261
|
-
return Lineage.empty()
|
|
262
|
-
|
|
263
|
-
for urn in parsed_result.in_tables:
|
|
264
|
-
dataplatform_tables.append(
|
|
265
|
-
DataPlatformTable(
|
|
266
|
-
data_platform_pair=self.get_platform_pair(),
|
|
267
|
-
urn=urn,
|
|
268
|
-
)
|
|
269
|
-
)
|
|
270
|
-
|
|
271
|
-
logger.debug(f"Native Query parsed result={parsed_result}")
|
|
272
|
-
logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
|
|
273
|
-
|
|
274
|
-
return Lineage(
|
|
275
|
-
upstreams=dataplatform_tables,
|
|
276
|
-
column_lineage=(
|
|
277
|
-
parsed_result.column_lineage
|
|
278
|
-
if parsed_result.column_lineage is not None
|
|
279
|
-
else []
|
|
280
|
-
),
|
|
281
|
-
)
|
|
282
|
-
|
|
283
|
-
|
|
284
31
|
class AbstractDataAccessMQueryResolver(ABC):
|
|
285
32
|
table: Table
|
|
286
33
|
parse_tree: Tree
|
|
@@ -299,10 +46,10 @@ class AbstractDataAccessMQueryResolver(ABC):
|
|
|
299
46
|
self.parse_tree = parse_tree
|
|
300
47
|
self.reporter = reporter
|
|
301
48
|
self.parameters = parameters
|
|
302
|
-
self.data_access_functions =
|
|
49
|
+
self.data_access_functions = SupportedPattern.get_function_names()
|
|
303
50
|
|
|
304
51
|
@abstractmethod
|
|
305
|
-
def
|
|
52
|
+
def resolve_to_lineage(
|
|
306
53
|
self,
|
|
307
54
|
ctx: PipelineContext,
|
|
308
55
|
config: PowerBiDashboardSourceConfig,
|
|
@@ -318,7 +65,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
|
|
318
65
|
This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail.
|
|
319
66
|
|
|
320
67
|
Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator
|
|
321
|
-
(see method
|
|
68
|
+
(see method resolve_to_lineage).
|
|
322
69
|
|
|
323
70
|
Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance
|
|
324
71
|
to the respective DataPlatformTable instance as per dataplatform.
|
|
@@ -602,7 +349,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
|
|
602
349
|
|
|
603
350
|
return table_links
|
|
604
351
|
|
|
605
|
-
def
|
|
352
|
+
def resolve_to_lineage(
|
|
606
353
|
self,
|
|
607
354
|
ctx: PipelineContext,
|
|
608
355
|
config: PowerBiDashboardSourceConfig,
|
|
@@ -630,7 +377,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
|
|
630
377
|
# Each item is data-access function
|
|
631
378
|
for f_detail in table_links:
|
|
632
379
|
# Get & Check if we support data-access-function available in M-Query
|
|
633
|
-
supported_resolver =
|
|
380
|
+
supported_resolver = SupportedPattern.get_pattern_handler(
|
|
634
381
|
f_detail.data_access_function_name
|
|
635
382
|
)
|
|
636
383
|
if supported_resolver is None:
|
|
@@ -643,11 +390,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
|
|
643
390
|
)
|
|
644
391
|
continue
|
|
645
392
|
|
|
646
|
-
# From supported_resolver enum get respective
|
|
647
|
-
# & also pass additional information that will be need to generate
|
|
648
|
-
|
|
649
|
-
AbstractDataPlatformTableCreator
|
|
650
|
-
) = supported_resolver.get_table_full_name_creator()(
|
|
393
|
+
# From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
|
|
394
|
+
# & also pass additional information that will be need to generate lineage
|
|
395
|
+
pattern_handler: (AbstractLineage) = supported_resolver.handler()(
|
|
651
396
|
ctx=ctx,
|
|
652
397
|
table=self.table,
|
|
653
398
|
config=config,
|
|
@@ -655,673 +400,6 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
|
|
655
400
|
platform_instance_resolver=platform_instance_resolver,
|
|
656
401
|
)
|
|
657
402
|
|
|
658
|
-
lineage.append(
|
|
403
|
+
lineage.append(pattern_handler.create_lineage(f_detail))
|
|
659
404
|
|
|
660
405
|
return lineage
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
|
|
664
|
-
"""
|
|
665
|
-
These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern
|
|
666
|
-
let
|
|
667
|
-
Source = Sql.Database("localhost", "library"),
|
|
668
|
-
dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
|
|
669
|
-
in
|
|
670
|
-
dbo_book_issue
|
|
671
|
-
"""
|
|
672
|
-
|
|
673
|
-
def two_level_access_pattern(
|
|
674
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
675
|
-
) -> Lineage:
|
|
676
|
-
logger.debug(
|
|
677
|
-
f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
|
|
678
|
-
)
|
|
679
|
-
|
|
680
|
-
server, db_name = self.get_db_detail_from_argument(
|
|
681
|
-
data_access_func_detail.arg_list
|
|
682
|
-
)
|
|
683
|
-
if server is None or db_name is None:
|
|
684
|
-
return Lineage.empty() # Return an empty list
|
|
685
|
-
|
|
686
|
-
schema_name: str = cast(
|
|
687
|
-
IdentifierAccessor, data_access_func_detail.identifier_accessor
|
|
688
|
-
).items["Schema"]
|
|
689
|
-
|
|
690
|
-
table_name: str = cast(
|
|
691
|
-
IdentifierAccessor, data_access_func_detail.identifier_accessor
|
|
692
|
-
).items["Item"]
|
|
693
|
-
|
|
694
|
-
qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
|
|
695
|
-
|
|
696
|
-
logger.debug(
|
|
697
|
-
f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
|
|
698
|
-
)
|
|
699
|
-
|
|
700
|
-
urn = urn_creator(
|
|
701
|
-
config=self.config,
|
|
702
|
-
platform_instance_resolver=self.platform_instance_resolver,
|
|
703
|
-
data_platform_pair=self.get_platform_pair(),
|
|
704
|
-
server=server,
|
|
705
|
-
qualified_table_name=qualified_table_name,
|
|
706
|
-
)
|
|
707
|
-
return Lineage(
|
|
708
|
-
upstreams=[
|
|
709
|
-
DataPlatformTable(
|
|
710
|
-
data_platform_pair=self.get_platform_pair(),
|
|
711
|
-
urn=urn,
|
|
712
|
-
)
|
|
713
|
-
],
|
|
714
|
-
column_lineage=[],
|
|
715
|
-
)
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
|
|
719
|
-
def create_lineage(
|
|
720
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
721
|
-
) -> Lineage:
|
|
722
|
-
return self.two_level_access_pattern(data_access_func_detail)
|
|
723
|
-
|
|
724
|
-
def get_platform_pair(self) -> DataPlatformPair:
|
|
725
|
-
return SupportedDataPlatform.POSTGRES_SQL.value
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
|
|
729
|
-
# https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
|
|
730
|
-
DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo
|
|
731
|
-
|
|
732
|
-
def get_platform_pair(self) -> DataPlatformPair:
|
|
733
|
-
return SupportedDataPlatform.MS_SQL.value
|
|
734
|
-
|
|
735
|
-
def create_urn_using_old_parser(
|
|
736
|
-
self, query: str, db_name: str, server: str
|
|
737
|
-
) -> List[DataPlatformTable]:
|
|
738
|
-
dataplatform_tables: List[DataPlatformTable] = []
|
|
739
|
-
|
|
740
|
-
tables: List[str] = native_sql_parser.get_tables(query)
|
|
741
|
-
|
|
742
|
-
for parsed_table in tables:
|
|
743
|
-
# components: List[str] = [v.strip("[]") for v in parsed_table.split(".")]
|
|
744
|
-
components = [v.strip("[]") for v in parsed_table.split(".")]
|
|
745
|
-
if len(components) == 3:
|
|
746
|
-
database, schema, table = components
|
|
747
|
-
elif len(components) == 2:
|
|
748
|
-
schema, table = components
|
|
749
|
-
database = db_name
|
|
750
|
-
elif len(components) == 1:
|
|
751
|
-
(table,) = components
|
|
752
|
-
database = db_name
|
|
753
|
-
schema = MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA
|
|
754
|
-
else:
|
|
755
|
-
self.reporter.warning(
|
|
756
|
-
title="Invalid table format",
|
|
757
|
-
message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as <db-name>.<schema-name>.<table-name> in the SQL.",
|
|
758
|
-
context=f"table-name={self.table.full_name}",
|
|
759
|
-
)
|
|
760
|
-
continue
|
|
761
|
-
|
|
762
|
-
qualified_table_name = f"{database}.{schema}.{table}"
|
|
763
|
-
urn = urn_creator(
|
|
764
|
-
config=self.config,
|
|
765
|
-
platform_instance_resolver=self.platform_instance_resolver,
|
|
766
|
-
data_platform_pair=self.get_platform_pair(),
|
|
767
|
-
server=server,
|
|
768
|
-
qualified_table_name=qualified_table_name,
|
|
769
|
-
)
|
|
770
|
-
dataplatform_tables.append(
|
|
771
|
-
DataPlatformTable(
|
|
772
|
-
data_platform_pair=self.get_platform_pair(),
|
|
773
|
-
urn=urn,
|
|
774
|
-
)
|
|
775
|
-
)
|
|
776
|
-
|
|
777
|
-
logger.debug(f"Generated upstream tables = {dataplatform_tables}")
|
|
778
|
-
|
|
779
|
-
return dataplatform_tables
|
|
780
|
-
|
|
781
|
-
def create_lineage(
|
|
782
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
783
|
-
) -> Lineage:
|
|
784
|
-
arguments: List[str] = tree_function.strip_char_from_list(
|
|
785
|
-
values=tree_function.remove_whitespaces_from_list(
|
|
786
|
-
tree_function.token_values(data_access_func_detail.arg_list)
|
|
787
|
-
),
|
|
788
|
-
)
|
|
789
|
-
|
|
790
|
-
server, database = self.get_db_detail_from_argument(
|
|
791
|
-
data_access_func_detail.arg_list
|
|
792
|
-
)
|
|
793
|
-
if server is None or database is None:
|
|
794
|
-
return Lineage.empty() # Return an empty list
|
|
795
|
-
|
|
796
|
-
assert server
|
|
797
|
-
assert database # to silent the lint
|
|
798
|
-
|
|
799
|
-
query: Optional[str] = get_next_item(arguments, "Query")
|
|
800
|
-
if query:
|
|
801
|
-
if self.config.enable_advance_lineage_sql_construct is False:
|
|
802
|
-
# Use previous parser to generate URN to keep backward compatibility
|
|
803
|
-
return Lineage(
|
|
804
|
-
upstreams=self.create_urn_using_old_parser(
|
|
805
|
-
query=query,
|
|
806
|
-
db_name=database,
|
|
807
|
-
server=server,
|
|
808
|
-
),
|
|
809
|
-
column_lineage=[],
|
|
810
|
-
)
|
|
811
|
-
|
|
812
|
-
return self.parse_custom_sql(
|
|
813
|
-
query=query,
|
|
814
|
-
database=database,
|
|
815
|
-
server=server,
|
|
816
|
-
schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA,
|
|
817
|
-
)
|
|
818
|
-
|
|
819
|
-
# It is a regular case of MS-SQL
|
|
820
|
-
logger.debug("Handling with regular case")
|
|
821
|
-
return self.two_level_access_pattern(data_access_func_detail)
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator):
|
|
825
|
-
def get_platform_pair(self) -> DataPlatformPair:
|
|
826
|
-
return SupportedDataPlatform.ORACLE.value
|
|
827
|
-
|
|
828
|
-
@staticmethod
|
|
829
|
-
def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
|
|
830
|
-
error_message: str = (
|
|
831
|
-
f"The target argument ({value}) should in the format of <host-name>:<port>/<db-name>["
|
|
832
|
-
".<domain>]"
|
|
833
|
-
)
|
|
834
|
-
splitter_result: List[str] = value.split("/")
|
|
835
|
-
if len(splitter_result) != 2:
|
|
836
|
-
logger.debug(error_message)
|
|
837
|
-
return None, None
|
|
838
|
-
|
|
839
|
-
db_name = splitter_result[1].split(".")[0]
|
|
840
|
-
|
|
841
|
-
return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
|
|
842
|
-
|
|
843
|
-
def create_lineage(
|
|
844
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
845
|
-
) -> Lineage:
|
|
846
|
-
logger.debug(
|
|
847
|
-
f"Processing Oracle data-access function detail {data_access_func_detail}"
|
|
848
|
-
)
|
|
849
|
-
|
|
850
|
-
arguments: List[str] = tree_function.remove_whitespaces_from_list(
|
|
851
|
-
tree_function.token_values(data_access_func_detail.arg_list)
|
|
852
|
-
)
|
|
853
|
-
|
|
854
|
-
server, db_name = self._get_server_and_db_name(arguments[0])
|
|
855
|
-
|
|
856
|
-
if db_name is None or server is None:
|
|
857
|
-
return Lineage.empty()
|
|
858
|
-
|
|
859
|
-
schema_name: str = cast(
|
|
860
|
-
IdentifierAccessor, data_access_func_detail.identifier_accessor
|
|
861
|
-
).items["Schema"]
|
|
862
|
-
|
|
863
|
-
table_name: str = cast(
|
|
864
|
-
IdentifierAccessor,
|
|
865
|
-
cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
|
|
866
|
-
).items["Name"]
|
|
867
|
-
|
|
868
|
-
qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
|
|
869
|
-
|
|
870
|
-
urn = urn_creator(
|
|
871
|
-
config=self.config,
|
|
872
|
-
platform_instance_resolver=self.platform_instance_resolver,
|
|
873
|
-
data_platform_pair=self.get_platform_pair(),
|
|
874
|
-
server=server,
|
|
875
|
-
qualified_table_name=qualified_table_name,
|
|
876
|
-
)
|
|
877
|
-
|
|
878
|
-
return Lineage(
|
|
879
|
-
upstreams=[
|
|
880
|
-
DataPlatformTable(
|
|
881
|
-
data_platform_pair=self.get_platform_pair(),
|
|
882
|
-
urn=urn,
|
|
883
|
-
)
|
|
884
|
-
],
|
|
885
|
-
column_lineage=[],
|
|
886
|
-
)
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator):
|
|
890
|
-
def form_qualified_table_name(
|
|
891
|
-
self,
|
|
892
|
-
table_reference: ReferencedTable,
|
|
893
|
-
data_platform_pair: DataPlatformPair,
|
|
894
|
-
) -> str:
|
|
895
|
-
platform_detail: PlatformDetail = (
|
|
896
|
-
self.platform_instance_resolver.get_platform_instance(
|
|
897
|
-
PowerBIPlatformDetail(
|
|
898
|
-
data_platform_pair=data_platform_pair,
|
|
899
|
-
data_platform_server=table_reference.warehouse,
|
|
900
|
-
)
|
|
901
|
-
)
|
|
902
|
-
)
|
|
903
|
-
|
|
904
|
-
metastore: Optional[str] = None
|
|
905
|
-
|
|
906
|
-
qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
|
|
907
|
-
|
|
908
|
-
if isinstance(platform_detail, DataBricksPlatformDetail):
|
|
909
|
-
metastore = platform_detail.metastore
|
|
910
|
-
|
|
911
|
-
if metastore is not None:
|
|
912
|
-
return f"{metastore}.{qualified_table_name}"
|
|
913
|
-
|
|
914
|
-
return qualified_table_name
|
|
915
|
-
|
|
916
|
-
def create_lineage(
|
|
917
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
918
|
-
) -> Lineage:
|
|
919
|
-
logger.debug(
|
|
920
|
-
f"Processing Databrick data-access function detail {data_access_func_detail}"
|
|
921
|
-
)
|
|
922
|
-
table_detail: Dict[str, str] = {}
|
|
923
|
-
temp_accessor: Optional[
|
|
924
|
-
Union[IdentifierAccessor, AbstractIdentifierAccessor]
|
|
925
|
-
] = data_access_func_detail.identifier_accessor
|
|
926
|
-
|
|
927
|
-
while temp_accessor:
|
|
928
|
-
if isinstance(temp_accessor, IdentifierAccessor):
|
|
929
|
-
# Condition to handle databricks M-query pattern where table, schema and database all are present in
|
|
930
|
-
# the same invoke statement
|
|
931
|
-
if all(
|
|
932
|
-
element in temp_accessor.items
|
|
933
|
-
for element in ["Item", "Schema", "Catalog"]
|
|
934
|
-
):
|
|
935
|
-
table_detail["Schema"] = temp_accessor.items["Schema"]
|
|
936
|
-
table_detail["Table"] = temp_accessor.items["Item"]
|
|
937
|
-
else:
|
|
938
|
-
table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
|
|
939
|
-
"Name"
|
|
940
|
-
]
|
|
941
|
-
|
|
942
|
-
if temp_accessor.next is not None:
|
|
943
|
-
temp_accessor = temp_accessor.next
|
|
944
|
-
else:
|
|
945
|
-
break
|
|
946
|
-
else:
|
|
947
|
-
logger.debug(
|
|
948
|
-
"expecting instance to be IdentifierAccessor, please check if parsing is done properly"
|
|
949
|
-
)
|
|
950
|
-
return Lineage.empty()
|
|
951
|
-
|
|
952
|
-
table_reference = self.create_reference_table(
|
|
953
|
-
arg_list=data_access_func_detail.arg_list,
|
|
954
|
-
table_detail=table_detail,
|
|
955
|
-
)
|
|
956
|
-
|
|
957
|
-
if table_reference:
|
|
958
|
-
qualified_table_name: str = self.form_qualified_table_name(
|
|
959
|
-
table_reference=table_reference,
|
|
960
|
-
data_platform_pair=self.get_platform_pair(),
|
|
961
|
-
)
|
|
962
|
-
|
|
963
|
-
urn = urn_creator(
|
|
964
|
-
config=self.config,
|
|
965
|
-
platform_instance_resolver=self.platform_instance_resolver,
|
|
966
|
-
data_platform_pair=self.get_platform_pair(),
|
|
967
|
-
server=table_reference.warehouse,
|
|
968
|
-
qualified_table_name=qualified_table_name,
|
|
969
|
-
)
|
|
970
|
-
|
|
971
|
-
return Lineage(
|
|
972
|
-
upstreams=[
|
|
973
|
-
DataPlatformTable(
|
|
974
|
-
data_platform_pair=self.get_platform_pair(),
|
|
975
|
-
urn=urn,
|
|
976
|
-
)
|
|
977
|
-
],
|
|
978
|
-
column_lineage=[],
|
|
979
|
-
)
|
|
980
|
-
|
|
981
|
-
return Lineage.empty()
|
|
982
|
-
|
|
983
|
-
def get_platform_pair(self) -> DataPlatformPair:
|
|
984
|
-
return SupportedDataPlatform.DATABRICK_SQL.value
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
class DefaultThreeStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
|
|
988
|
-
def get_datasource_server(
|
|
989
|
-
self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
|
|
990
|
-
) -> str:
|
|
991
|
-
return tree_function.strip_char_from_list([arguments[0]])[0]
|
|
992
|
-
|
|
993
|
-
def create_lineage(
|
|
994
|
-
self, data_access_func_detail: DataAccessFunctionDetail
|
|
995
|
-
) -> Lineage:
|
|
996
|
-
logger.debug(
|
|
997
|
-
f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
|
|
998
|
-
)
|
|
999
|
-
|
|
1000
|
-
arguments: List[str] = tree_function.remove_whitespaces_from_list(
|
|
1001
|
-
tree_function.token_values(data_access_func_detail.arg_list)
|
|
1002
|
-
)
|
|
1003
|
-
# First is database name
|
|
1004
|
-
db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore
|
|
1005
|
-
# Second is schema name
|
|
1006
|
-
schema_name: str = cast(
|
|
1007
|
-
IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore
|
|
1008
|
-
).items["Name"]
|
|
1009
|
-
# Third is table name
|
|
1010
|
-
table_name: str = cast(
|
|
1011
|
-
IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
|
|
1012
|
-
).items["Name"]
|
|
1013
|
-
|
|
1014
|
-
qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
|
|
1015
|
-
|
|
1016
|
-
logger.debug(
|
|
1017
|
-
f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
|
|
1018
|
-
)
|
|
1019
|
-
|
|
1020
|
-
server: str = self.get_datasource_server(arguments, data_access_func_detail)
|
|
1021
|
-
|
|
1022
|
-
urn = urn_creator(
|
|
1023
|
-
config=self.config,
|
|
1024
|
-
platform_instance_resolver=self.platform_instance_resolver,
|
|
1025
|
-
data_platform_pair=self.get_platform_pair(),
|
|
1026
|
-
server=server,
|
|
1027
|
-
qualified_table_name=qualified_table_name,
|
|
1028
|
-
)
|
|
1029
|
-
|
|
1030
|
-
return Lineage(
|
|
1031
|
-
upstreams=[
|
|
1032
|
-
DataPlatformTable(
|
|
1033
|
-
data_platform_pair=self.get_platform_pair(),
|
|
1034
|
-
urn=urn,
|
|
1035
|
-
)
|
|
1036
|
-
],
|
|
1037
|
-
column_lineage=[],
|
|
1038
|
-
)
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
    """Lineage creator for Snowflake data-access expressions.

    Relies entirely on the default three-step (database.schema.table)
    access pattern implemented by the base class.
    """

    def get_platform_pair(self) -> DataPlatformPair:
        """Return the PowerBI/DataHub platform-name mapping for Snowflake."""
        return SupportedDataPlatform.SNOWFLAKE.value
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
class GoogleBigQueryDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
    """Lineage creator for Google BigQuery data-access expressions.

    Uses the default three-step access pattern; only the notion of
    "server" differs, which for BigQuery is the GCP project name.
    """

    def get_platform_pair(self) -> DataPlatformPair:
        """Return the PowerBI/DataHub platform-name mapping for BigQuery."""
        return SupportedDataPlatform.GOOGLE_BIGQUERY.value

    def get_datasource_server(
        self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
    ) -> str:
        """Return the BigQuery project name, which plays the role of server.

        The project name is the first identifier in the access chain.
        """
        accessor = data_access_func_detail.identifier_accessor
        # None is not expected at runtime; the guard keeps the type-checker
        # satisfied without a cast.
        if accessor is None:
            return ""
        return accessor.items["Name"]
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator):
    """Builds upstream lineage for Amazon Redshift data-access expressions.

    Server and database come from the data-access function's argument list;
    schema and table come from the identifier-accessor chain
    (schema -> table).
    """

    def get_platform_pair(self) -> DataPlatformPair:
        """Return the PowerBI/DataHub platform-name mapping for Redshift."""
        return SupportedDataPlatform.AMAZON_REDSHIFT.value

    def create_lineage(
        self, data_access_func_detail: DataAccessFunctionDetail
    ) -> Lineage:
        """Create single-table lineage (no column lineage) for a Redshift
        data-access expression.

        Returns Lineage.empty() when server or database cannot be extracted
        from the argument list.
        """
        logger.debug(
            f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
        )

        server, db_name = self.get_db_detail_from_argument(
            data_access_func_detail.arg_list
        )
        if db_name is None or server is None:
            return Lineage.empty()  # Return empty list

        # First link of the accessor chain is the schema name.
        schema_name: str = cast(
            IdentifierAccessor, data_access_func_detail.identifier_accessor
        ).items["Name"]

        # Second link of the accessor chain is the table name.
        table_name: str = cast(
            IdentifierAccessor,
            cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
        ).items["Name"]

        qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"

        urn = urn_creator(
            config=self.config,
            platform_instance_resolver=self.platform_instance_resolver,
            data_platform_pair=self.get_platform_pair(),
            server=server,
            qualified_table_name=qualified_table_name,
        )

        return Lineage(
            upstreams=[
                DataPlatformTable(
                    data_platform_pair=self.get_platform_pair(),
                    urn=urn,
                )
            ],
            column_lineage=[],
        )
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator):
    """Creates lineage for tables referenced inside Value.NativeQuery(...)
    expressions by parsing the embedded native SQL query.

    Only the platforms listed in SUPPORTED_NATIVE_QUERY_DATA_PLATFORM are
    supported for native-query parsing.
    """

    # Maps the PowerBI data-platform name (first token of the data-access
    # expression) to the corresponding SupportedDataPlatform member.
    SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
        SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
        SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
        SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
    }
    # Platform of the query currently being processed; set per-expression in
    # create_lineage and read back through get_platform_pair.
    current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE

    def get_platform_pair(self) -> DataPlatformPair:
        """Return the platform pair of the most recently processed query."""
        return self.current_data_platform.value

    @staticmethod
    def is_native_parsing_supported(data_access_function_name: str) -> bool:
        """Return True if native-query parsing supports the given platform name."""
        return (
            data_access_function_name
            in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
        )

    def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
        """Legacy URN generation path (used when advance lineage SQL construct
        is disabled).

        Extracts table names with the old SQL parser, keeps only names in
        db.schema.table form, and produces no column lineage.
        """
        dataplatform_tables: List[DataPlatformTable] = []

        tables: List[str] = native_sql_parser.get_tables(query)

        for qualified_table_name in tables:
            if len(qualified_table_name.split(".")) != 3:
                logger.debug(
                    f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format"
                )
                continue

            urn = urn_creator(
                config=self.config,
                platform_instance_resolver=self.platform_instance_resolver,
                data_platform_pair=self.get_platform_pair(),
                server=server,
                qualified_table_name=qualified_table_name,
            )

            dataplatform_tables.append(
                DataPlatformTable(
                    data_platform_pair=self.get_platform_pair(),
                    urn=urn,
                )
            )

        logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")

        return Lineage(
            upstreams=dataplatform_tables,
            column_lineage=[],
        )

    def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
        """Extract the database name from a Databricks data-access token list.

        Returns None for non-Databricks platforms. Tries the explicit
        "Database" argument first, then "Name", then "Catalog".
        """
        if (
            data_access_tokens[0]
            != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
        ):
            return None

        database: Optional[str] = get_next_item(data_access_tokens, "Database")

        if (
            database and database != Constant.M_QUERY_NULL
        ):  # database name is explicitly set
            return database

        return get_next_item(  # database name is set in Name argument
            data_access_tokens, "Name"
        ) or get_next_item(  # If both above arguments are not available, then try Catalog
            data_access_tokens, "Catalog"
        )

    def create_lineage(
        self, data_access_func_detail: DataAccessFunctionDetail
    ) -> Lineage:
        """Parse a Value.NativeQuery expression and emit lineage for the
        tables referenced by its embedded SQL query.

        Returns Lineage.empty() when the expression shape is unexpected or
        the data platform is unsupported.
        """
        t1: Tree = cast(
            Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
        )
        flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)

        # Value.NativeQuery takes (data-access expression, sql-query).
        if len(flat_argument_list) != 2:
            logger.debug(
                f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}"
            )
            logger.debug(f"Flat argument list = {flat_argument_list}")
            return Lineage.empty()

        data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
            tree_function.token_values(flat_argument_list[0])
        )

        # Robustness: guard against an empty token list before indexing below.
        if not data_access_tokens:
            logger.debug("Native query data-access token list is empty")
            return Lineage.empty()

        if not self.is_native_parsing_supported(data_access_tokens[0]):
            logger.debug(
                f"Unsupported native-query data-platform = {data_access_tokens[0]}"
            )
            logger.debug(
                f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
            )

            return Lineage.empty()

        # BUG FIX: the original guard was `len(data_access_tokens[0]) < 3`,
        # i.e. the length of the platform-name *string*. Per the log message
        # and the data_access_tokens[2] access below, the intent is to ensure
        # the token *list* contains a server element at index 2.
        if len(data_access_tokens) < 3:
            logger.debug(
                f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty "
                "list"
            )
            return Lineage.empty()

        self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
            data_access_tokens[0]
        ]
        # The second flat argument is the native SQL query itself.
        sql_query: str = tree_function.strip_char_from_list(
            values=tree_function.remove_whitespaces_from_list(
                tree_function.token_values(flat_argument_list[1])
            ),
        )[
            0
        ]  # Remove any whitespaces and double quotes character

        server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]

        if self.config.enable_advance_lineage_sql_construct is False:
            # Use previous parser to generate URN to keep backward compatibility
            return self.create_urn_using_old_parser(
                query=sql_query,
                server=server,
            )

        database_name: Optional[str] = self.get_db_name(data_access_tokens)

        return self.parse_custom_sql(
            query=sql_query,
            server=server,
            database=database_name,
            schema=None,
        )
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
class FunctionName(Enum):
    """M-Query data-access function names that the resolver recognises.

    Each value is the literal function name as it appears in a PowerBI
    M-Query expression (e.g. ``Snowflake.Databases(...)``).
    """

    NATIVE_QUERY = "Value.NativeQuery"
    POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
    ORACLE_DATA_ACCESS = "Oracle.Database"
    SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
    MSSQL_DATA_ACCESS = "Sql.Database"
    DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
    GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
    AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
    DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
class SupportedResolver(Enum):
    """Registry mapping each supported data-access function to the
    table-creator class that resolves it into lineage.

    Each member's value is a ``(creator_class, FunctionName)`` pair.
    """

    DATABRICKS_QUERY = (
        DatabrickDataPlatformTableCreator,
        FunctionName.DATABRICK_DATA_ACCESS,
    )

    DATABRICKS_MULTI_CLOUD = (
        DatabrickDataPlatformTableCreator,
        FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
    )

    POSTGRES_SQL = (
        PostgresDataPlatformTableCreator,
        FunctionName.POSTGRESQL_DATA_ACCESS,
    )

    ORACLE = (
        OracleDataPlatformTableCreator,
        FunctionName.ORACLE_DATA_ACCESS,
    )

    SNOWFLAKE = (
        SnowflakeDataPlatformTableCreator,
        FunctionName.SNOWFLAKE_DATA_ACCESS,
    )

    MS_SQL = (
        MSSqlDataPlatformTableCreator,
        FunctionName.MSSQL_DATA_ACCESS,
    )

    GOOGLE_BIG_QUERY = (
        GoogleBigQueryDataPlatformTableCreator,
        FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
    )

    AMAZON_REDSHIFT = (
        AmazonRedshiftDataPlatformTableCreator,
        FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
    )

    NATIVE_QUERY = (
        NativeQueryDataPlatformTableCreator,
        FunctionName.NATIVE_QUERY,
    )

    def get_table_full_name_creator(self) -> Type[AbstractDataPlatformTableCreator]:
        """Return the table-creator class associated with this resolver."""
        creator_cls, _ = self.value
        return creator_cls

    def get_function_name(self) -> str:
        """Return the M-Query function name handled by this resolver."""
        _, function_name = self.value
        return function_name.value

    @staticmethod
    def get_function_names() -> List[str]:
        """Return the function names of every supported resolver."""
        return [resolver.get_function_name() for resolver in SupportedResolver]

    @staticmethod
    def get_resolver(function_name: str) -> Optional["SupportedResolver"]:
        """Look up the resolver handling ``function_name``; None if unknown."""
        logger.debug(f"Looking for resolver {function_name}")
        for candidate in SupportedResolver:
            if candidate.get_function_name() == function_name:
                return candidate
        logger.debug(f"Resolver not found for function_name {function_name}")
        return None
|