acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -0,0 +1,912 @@
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Dict, List, Optional, Tuple, Type, cast
+
+from lark import Tree
+
+from datahub.emitter import mce_builder as builder
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.source.powerbi.config import (
+    Constant,
+    DataBricksPlatformDetail,
+    DataPlatformPair,
+    PlatformDetail,
+    PowerBiDashboardSourceConfig,
+    PowerBiDashboardSourceReport,
+    PowerBIPlatformDetail,
+    SupportedDataPlatform,
+)
+from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+    AbstractDataPlatformInstanceResolver,
+)
+from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
+from datahub.ingestion.source.powerbi.m_query.data_classes import (
+    DataAccessFunctionDetail,
+    DataPlatformTable,
+    FunctionName,
+    IdentifierAccessor,
+    Lineage,
+    ReferencedTable,
+)
+from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+
+logger = logging.getLogger(__name__)
+
+
+def get_next_item(items: List[str], item: str) -> Optional[str]:
+    if item in items:
+        try:
+            index = items.index(item)
+            return items[index + 1]
+        except IndexError:
+            logger.debug(f'item:"{item}", not found in item-list: {items}')
+    return None
+
+
+def urn_to_lowercase(value: str, flag: bool) -> str:
+    if flag is True:
+        return value.lower()
+
+    return value
+
+
+def make_urn(
+    config: PowerBiDashboardSourceConfig,
+    platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+    data_platform_pair: DataPlatformPair,
+    server: str,
+    qualified_table_name: str,
+) -> str:
+    platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+        PowerBIPlatformDetail(
+            data_platform_pair=data_platform_pair,
+            data_platform_server=server,
+        )
+    )
+
+    return builder.make_dataset_urn_with_platform_instance(
+        platform=data_platform_pair.datahub_data_platform_name,
+        platform_instance=platform_detail.platform_instance,
+        env=platform_detail.env,
+        name=urn_to_lowercase(
+            qualified_table_name, config.convert_lineage_urns_to_lowercase
+        ),
+    )
+
+
+class AbstractLineage(ABC):
+    """
+    Base class that shares common functionality for M-Query parsing across the different data platforms.
+
+    To create a qualified table name we need to parse the M-Query data-access functions (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions), and
+    each data-access function has a defined pattern for accessing the database name, schema name and table name; for example, see the M-Query below.
+
+        let
+            Source = Sql.Database("localhost", "library"),
+            dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+        in
+            dbo_book_issue
+
+    This is an MSSQL M-Query, and Sql.Database is the data-access function for MSSQL. If this function is present in an M-Query then the database name is available in the second argument of the first statement, and the schema name and table name are available in the second statement. The second statement can be repeated to access different tables from MSSQL.
+
+    TwoStepDataAccessPattern extends AbstractLineage and provides the common functionality for data platforms whose M-Query follows the above pattern.
+
+    The data-access function varies per data platform (for example MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle), and the number of statements needed to
+    find the database name, schema name and table name also varies per data platform.
+
+    Value.NativeQuery is one of the functions used to execute a native query inside an M-Query; for example, see the M-Query below.
+
+        let
+            Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+        in
+            Source
+
+    In this M-Query the database name is available in the first argument, and the remaining details (schema and table) are available in the native query itself.
+
+    NativeQueryLineage extends AbstractLineage to support Redshift and Snowflake native query parsing.
+
+    """
+
+    ctx: PipelineContext
+    table: Table
+    config: PowerBiDashboardSourceConfig
+    reporter: PowerBiDashboardSourceReport
+    platform_instance_resolver: AbstractDataPlatformInstanceResolver
+
+    def __init__(
+        self,
+        ctx: PipelineContext,
+        table: Table,
+        config: PowerBiDashboardSourceConfig,
+        reporter: PowerBiDashboardSourceReport,
+        platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+    ) -> None:
+        super().__init__()
+        self.ctx = ctx
+        self.table = table
+        self.config = config
+        self.reporter = reporter
+        self.platform_instance_resolver = platform_instance_resolver
+
+    @abstractmethod
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        pass
+
+    @abstractmethod
+    def get_platform_pair(self) -> DataPlatformPair:
+        pass
+
+    @staticmethod
+    def get_db_detail_from_argument(
+        arg_list: Tree,
+    ) -> Tuple[Optional[str], Optional[str]]:
+        arguments: List[str] = tree_function.strip_char_from_list(
+            values=tree_function.remove_whitespaces_from_list(
+                tree_function.token_values(arg_list)
+            ),
+        )
+
+        if len(arguments) < 2:
+            logger.debug(f"Expected a minimum of 2 arguments, but got {len(arguments)}")
+            return None, None
+
+        return arguments[0], arguments[1]
+
+    @staticmethod
+    def create_reference_table(
+        arg_list: Tree,
+        table_detail: Dict[str, str],
+    ) -> Optional[ReferencedTable]:
+        arguments: List[str] = tree_function.strip_char_from_list(
+            values=tree_function.remove_whitespaces_from_list(
+                tree_function.token_values(arg_list)
+            ),
+        )
+
+        logger.debug(f"Processing arguments {arguments}")
+
+        if (
+            len(arguments)
+            >= 4  # [0] is warehouse FQDN.
+            # [1] is endpoint, we are not using it.
+            # [2] is "Catalog" key
+            # [3] is catalog's value
+        ):
+            return ReferencedTable(
+                warehouse=arguments[0],
+                catalog=arguments[3],
+                # As per observation, the database and catalog names are the same in M-Query
+                database=table_detail["Database"]
+                if table_detail.get("Database")
+                else arguments[3],
+                schema=table_detail["Schema"],
+                table=table_detail.get("Table") or table_detail["View"],
+            )
+        elif len(arguments) == 2:
+            return ReferencedTable(
+                warehouse=arguments[0],
+                database=table_detail["Database"],
+                schema=table_detail["Schema"],
+                table=table_detail.get("Table") or table_detail["View"],
+                catalog=None,
+            )
+
+        return None
+
+    def parse_custom_sql(
+        self, query: str, server: str, database: Optional[str], schema: Optional[str]
+    ) -> Lineage:
+        dataplatform_tables: List[DataPlatformTable] = []
+
+        platform_detail: PlatformDetail = (
+            self.platform_instance_resolver.get_platform_instance(
+                PowerBIPlatformDetail(
+                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_server=server,
+                )
+            )
+        )
+
+        query = native_sql_parser.remove_drop_statement(
+            native_sql_parser.remove_special_characters(query)
+        )
+
+        parsed_result: Optional[
+            "SqlParsingResult"
+        ] = native_sql_parser.parse_custom_sql(
+            ctx=self.ctx,
+            query=query,
+            platform=self.get_platform_pair().datahub_data_platform_name,
+            platform_instance=platform_detail.platform_instance,
+            env=platform_detail.env,
+            database=database,
+            schema=schema,
+        )
+
+        if parsed_result is None:
+            self.reporter.info(
+                title=Constant.SQL_PARSING_FAILURE,
+                message="Failed to parse native SQL present in PowerBI M-Query",
+                context=f"table-name={self.table.full_name}, sql={query}",
+            )
+            return Lineage.empty()
+
+        if parsed_result.debug_info and parsed_result.debug_info.table_error:
+            self.reporter.warning(
+                title=Constant.SQL_PARSING_FAILURE,
+                message="Failed to parse native SQL present in PowerBI M-Query",
+                context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error}, sql={query}",
+            )
+            return Lineage.empty()
+
+        for urn in parsed_result.in_tables:
+            dataplatform_tables.append(
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            )
+
+        logger.debug(f"Native Query parsed result={parsed_result}")
+        logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
+
+        return Lineage(
+            upstreams=dataplatform_tables,
+            column_lineage=(
+                parsed_result.column_lineage
+                if parsed_result.column_lineage is not None
+                else []
+            ),
+        )
+
+
+class AmazonRedshiftLineage(AbstractLineage):
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.AMAZON_REDSHIFT.value
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if db_name is None or server is None:
+            return Lineage.empty()  # Return empty lineage
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Name"]
+
+        table_name: str = cast(
+            IdentifierAccessor,
+            cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+        ).items["Name"]
+
+        qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=[],
+        )
+
+
+class OracleLineage(AbstractLineage):
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ORACLE.value
+
+    @staticmethod
+    def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
+        error_message: str = (
+            f"The target argument ({value}) should be in the format <host-name>:<port>/<db-name>["
+            ".<domain>]"
+        )
+        splitter_result: List[str] = value.split("/")
+        if len(splitter_result) != 2:
+            logger.debug(error_message)
+            return None, None
+
+        db_name = splitter_result[1].split(".")[0]
+
+        return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing Oracle data-access function detail {data_access_func_detail}"
+        )
+
+        arguments: List[str] = tree_function.remove_whitespaces_from_list(
+            tree_function.token_values(data_access_func_detail.arg_list)
+        )
+
+        server, db_name = self._get_server_and_db_name(arguments[0])
+
+        if db_name is None or server is None:
+            return Lineage.empty()
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor,
+            cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+        ).items["Name"]
+
+        qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=[],
+        )
+
+
+class DatabricksLineage(AbstractLineage):
+    def form_qualified_table_name(
+        self,
+        table_reference: ReferencedTable,
+        data_platform_pair: DataPlatformPair,
+    ) -> str:
+        platform_detail: PlatformDetail = (
+            self.platform_instance_resolver.get_platform_instance(
+                PowerBIPlatformDetail(
+                    data_platform_pair=data_platform_pair,
+                    data_platform_server=table_reference.warehouse,
+                )
+            )
+        )
+
+        metastore: Optional[str] = None
+
+        qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
+
+        if isinstance(platform_detail, DataBricksPlatformDetail):
+            metastore = platform_detail.metastore
+
+        if metastore is not None:
+            return f"{metastore}.{qualified_table_name}"
+
+        return qualified_table_name
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing Databricks data-access function detail {data_access_func_detail}"
+        )
+        table_detail: Dict[str, str] = {}
+        temp_accessor: Optional[
+            IdentifierAccessor
+        ] = data_access_func_detail.identifier_accessor
+
+        while temp_accessor:
+            # Condition to handle the Databricks M-Query pattern where table, schema and database are all present in
+            # the same invoke statement
+            if all(
+                element in temp_accessor.items
+                for element in ["Item", "Schema", "Catalog"]
+            ):
+                table_detail["Schema"] = temp_accessor.items["Schema"]
+                table_detail["Table"] = temp_accessor.items["Item"]
+            else:
+                table_detail[temp_accessor.items["Kind"]] = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        table_reference = self.create_reference_table(
+            arg_list=data_access_func_detail.arg_list,
+            table_detail=table_detail,
+        )
+
+        if table_reference:
+            qualified_table_name: str = self.form_qualified_table_name(
+                table_reference=table_reference,
+                data_platform_pair=self.get_platform_pair(),
+            )
+
+            urn = make_urn(
+                config=self.config,
+                platform_instance_resolver=self.platform_instance_resolver,
+                data_platform_pair=self.get_platform_pair(),
+                server=table_reference.warehouse,
+                qualified_table_name=qualified_table_name,
+            )
+
+            return Lineage(
+                upstreams=[
+                    DataPlatformTable(
+                        data_platform_pair=self.get_platform_pair(),
+                        urn=urn,
+                    )
+                ],
+                column_lineage=[],
+            )
+
+        return Lineage.empty()
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.DATABRICKS_SQL.value
+
+
+class TwoStepDataAccessPattern(AbstractLineage, ABC):
+    """
+    These are the data sources for which PowerBI Desktop generates a default M-Query of the following pattern:
+        let
+            Source = Sql.Database("localhost", "library"),
+            dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+        in
+            dbo_book_issue
+    """
+
+    def two_level_access_pattern(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or db_name is None:
+            return Lineage.empty()  # Return empty lineage
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Item"]
+
+        qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+        logger.debug(
+            f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=[],
+        )
+
+
+class PostgresLineage(TwoStepDataAccessPattern):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        return self.two_level_access_pattern(data_access_func_detail)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.POSTGRES_SQL.value
+
+
+class MSSqlLineage(TwoStepDataAccessPattern):
+    # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
+    DEFAULT_SCHEMA = "dbo"  # Default schema name in MS-SQL is dbo
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.MS_SQL.value
+
+    def create_urn_using_old_parser(
+        self, query: str, db_name: str, server: str
+    ) -> List[DataPlatformTable]:
+        dataplatform_tables: List[DataPlatformTable] = []
+
+        tables: List[str] = native_sql_parser.get_tables(query)
+
+        for parsed_table in tables:
+            # Strip any enclosing [brackets] from each component of the parsed table name
+            components = [v.strip("[]") for v in parsed_table.split(".")]
+            if len(components) == 3:
+                database, schema, table = components
+            elif len(components) == 2:
+                schema, table = components
+                database = db_name
+            elif len(components) == 1:
+                (table,) = components
+                database = db_name
+                schema = MSSqlLineage.DEFAULT_SCHEMA
+            else:
+                self.reporter.warning(
+                    title="Invalid table format",
+                    message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as <db-name>.<schema-name>.<table-name> in the SQL.",
+                    context=f"table-name={self.table.full_name}",
+                )
+                continue
+
+            qualified_table_name = f"{database}.{schema}.{table}"
+            urn = make_urn(
+                config=self.config,
+                platform_instance_resolver=self.platform_instance_resolver,
+                data_platform_pair=self.get_platform_pair(),
+                server=server,
+                qualified_table_name=qualified_table_name,
+            )
+            dataplatform_tables.append(
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            )
+
+        logger.debug(f"Generated upstream tables = {dataplatform_tables}")
+
+        return dataplatform_tables
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        arguments: List[str] = tree_function.strip_char_from_list(
+            values=tree_function.remove_whitespaces_from_list(
+                tree_function.token_values(data_access_func_detail.arg_list)
+            ),
+        )
+
+        server, database = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or database is None:
+            return Lineage.empty()  # Return empty lineage
+
+        assert server
+        assert database  # to silence the linter
+
+        query: Optional[str] = get_next_item(arguments, "Query")
+        if query:
+            if self.config.enable_advance_lineage_sql_construct is False:
+                # Use the previous parser to generate the URN to keep backward compatibility
+                return Lineage(
+                    upstreams=self.create_urn_using_old_parser(
+                        query=query,
+                        db_name=database,
+                        server=server,
+                    ),
+                    column_lineage=[],
+                )
+
+            return self.parse_custom_sql(
+                query=query,
+                database=database,
+                server=server,
+                schema=MSSqlLineage.DEFAULT_SCHEMA,
+            )
+
+        # It is the regular MS-SQL case
+        logger.debug("Handling the regular MS-SQL case")
+        return self.two_level_access_pattern(data_access_func_detail)
+
+
+class ThreeStepDataAccessPattern(AbstractLineage, ABC):
+    def get_datasource_server(
+        self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+    ) -> str:
+        return tree_function.strip_char_from_list([arguments[0]])[0]
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
+        )
+
+        arguments: List[str] = tree_function.remove_whitespaces_from_list(
+            tree_function.token_values(data_access_func_detail.arg_list)
+        )
+        # First is the database name
+        db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
+        # Second is the schema name
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+        ).items["Name"]
+        # Third is the table name
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+        ).items["Name"]
+
+        qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+        logger.debug(
+            f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
+        )
+
+        server: str = self.get_datasource_server(arguments, data_access_func_detail)
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=[],
+        )
+
+
+class SnowflakeLineage(ThreeStepDataAccessPattern):
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.SNOWFLAKE.value
+
+
+class GoogleBigQueryLineage(ThreeStepDataAccessPattern):
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.GOOGLE_BIGQUERY.value
+
+    def get_datasource_server(
+        self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+    ) -> str:
+        # In Google BigQuery the server is the project name
+        # Condition to silence the linter; it is not going to be None
+        return (
+            data_access_func_detail.identifier_accessor.items["Name"]
+            if data_access_func_detail.identifier_accessor is not None
+            else ""
+        )
+
+
+class NativeQueryLineage(AbstractLineage):
+    SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
+        SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
+        SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
+        SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
+    }
+    current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return self.current_data_platform.value
+
+    @staticmethod
+    def is_native_parsing_supported(data_access_function_name: str) -> bool:
+        return (
+            data_access_function_name
+            in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
+        )
+
+    def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
+        dataplatform_tables: List[DataPlatformTable] = []
+
+        tables: List[str] = native_sql_parser.get_tables(query)
+
+        for qualified_table_name in tables:
+            if len(qualified_table_name.split(".")) != 3:
+                logger.debug(
+                    f"Skipping table {qualified_table_name} as it does not match the qualified_table_name format"
+                )
+                continue
+
+            urn = make_urn(
+                config=self.config,
+                platform_instance_resolver=self.platform_instance_resolver,
+                data_platform_pair=self.get_platform_pair(),
+                server=server,
+                qualified_table_name=qualified_table_name,
+            )
+
+            dataplatform_tables.append(
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            )
+
+        logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
+
+        return Lineage(
+            upstreams=dataplatform_tables,
+            column_lineage=[],
+        )
+
+    def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
+        if (
+            data_access_tokens[0]
+            != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
+        ):
+            return None
+
+        database: Optional[str] = get_next_item(data_access_tokens, "Database")
+
+        if (
+            database and database != Constant.M_QUERY_NULL
+        ):  # database name is explicitly set
+            return database
+
+        return get_next_item(  # database name is set in the Name argument
+            data_access_tokens, "Name"
+        ) or get_next_item(  # If neither of the above arguments is available, then try Catalog
+            data_access_tokens, "Catalog"
+        )
+
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        t1: Optional[Tree] = tree_function.first_arg_list_func(
+            data_access_func_detail.arg_list
+        )
+        assert t1 is not None
+        flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
+
+        if len(flat_argument_list) != 2:
+            logger.debug(
+                f"Expecting 2 arguments, actual argument count is {len(flat_argument_list)}"
+            )
+            logger.debug(f"Flat argument list = {flat_argument_list}")
+            return Lineage.empty()
+
+        data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
+            tree_function.token_values(flat_argument_list[0])
+        )
+
+        if not self.is_native_parsing_supported(data_access_tokens[0]):
+            logger.debug(
+                f"Unsupported native-query data-platform = {data_access_tokens[0]}"
+            )
+            logger.debug(
+                f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
+            )
+
+            return Lineage.empty()
+
+        if len(data_access_tokens[0]) < 3:
+            logger.debug(
+                f"Server is not available in the argument list for data-platform {data_access_tokens[0]}. Returning empty "
+                "list"
+            )
+            return Lineage.empty()
+
+        self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
+            data_access_tokens[0]
+        ]
+        # The second flat argument is the native query
+        sql_query: str = tree_function.strip_char_from_list(
+            values=tree_function.remove_whitespaces_from_list(
+                tree_function.token_values(flat_argument_list[1])
+            ),
+        )[
+            0
+        ]  # Remove any whitespace and double-quote characters
+
+        server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
+
+        if self.config.enable_advance_lineage_sql_construct is False:
+            # Use the previous parser to generate the URN to keep backward compatibility
+            return self.create_urn_using_old_parser(
+                query=sql_query,
+                server=server,
+            )
+
+        database_name: Optional[str] = self.get_db_name(data_access_tokens)
+
+        return self.parse_custom_sql(
+            query=sql_query,
+            server=server,
+            database=database_name,
+            schema=None,
+        )
+
+
+class SupportedPattern(Enum):
+    DATABRICKS_QUERY = (
+        DatabricksLineage,
+        FunctionName.DATABRICK_DATA_ACCESS,
+    )
+
+    DATABRICKS_MULTI_CLOUD = (
+        DatabricksLineage,
+        FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
+    )
+
+    POSTGRES_SQL = (
+        PostgresLineage,
+        FunctionName.POSTGRESQL_DATA_ACCESS,
+    )
+
+    ORACLE = (
+        OracleLineage,
+        FunctionName.ORACLE_DATA_ACCESS,
+    )
+
+    SNOWFLAKE = (
+        SnowflakeLineage,
+        FunctionName.SNOWFLAKE_DATA_ACCESS,
+    )
+
+    MS_SQL = (
+        MSSqlLineage,
+        FunctionName.MSSQL_DATA_ACCESS,
+    )
+
+    GOOGLE_BIG_QUERY = (
+        GoogleBigQueryLineage,
+        FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
+    )
+
+    AMAZON_REDSHIFT = (
+        AmazonRedshiftLineage,
+        FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
+    )
+
+    NATIVE_QUERY = (
+        NativeQueryLineage,
+        FunctionName.NATIVE_QUERY,
+    )
+
+    def handler(self) -> Type[AbstractLineage]:
+        return self.value[0]
+
+    def function_name(self) -> str:
+        return self.value[1].value
+
+    @staticmethod
+    def get_function_names() -> List[str]:
+        functions: List[str] = []
+        for supported_resolver in SupportedPattern:
+            functions.append(supported_resolver.function_name())
+
+        return functions
+
+    @staticmethod
+    def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]:
+        logger.debug(f"Looking for pattern-handler for {function_name}")
+        for supported_resolver in SupportedPattern:
+            if function_name == supported_resolver.function_name():
+                return supported_resolver
+        logger.debug(f"pattern-handler not found for function_name {function_name}")
+        return None