acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
- datahub_airflow_plugin/_airflow_shims.py +31 -64
- datahub_airflow_plugin/_config.py +19 -97
- datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
- datahub_airflow_plugin/_extractors.py +365 -0
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/client/airflow_generator.py +43 -147
- datahub_airflow_plugin/datahub_listener.py +790 -19
- datahub_airflow_plugin/example_dags/__init__.py +0 -32
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
- datahub_airflow_plugin/hooks/datahub.py +2 -11
- datahub_airflow_plugin/operators/datahub.py +3 -20
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
- datahub_airflow_plugin/_airflow_compat.py +0 -32
- datahub_airflow_plugin/_airflow_version_specific.py +0 -184
- datahub_airflow_plugin/_constants.py +0 -16
- datahub_airflow_plugin/airflow2/__init__.py +0 -6
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
- datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
- datahub_airflow_plugin/airflow2/_extractors.py +0 -477
- datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
- datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
- datahub_airflow_plugin/airflow2/_shims.py +0 -88
- datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
- datahub_airflow_plugin/airflow3/__init__.py +0 -6
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
- datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
- datahub_airflow_plugin/airflow3/_shims.py +0 -82
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
- datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,408 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Patch for Airflow 3.0+ SQLParser to use DataHub's SQL parser.
|
|
3
|
-
|
|
4
|
-
In Airflow 3.0+, SQL operators call SQLParser.generate_openlineage_metadata_from_sql()
|
|
5
|
-
directly rather than using extractors. This module patches that method to use DataHub's
|
|
6
|
-
SQL parser, which provides better column-level lineage support.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import logging
|
|
10
|
-
from types import TracebackType
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Callable, Optional
|
|
12
|
-
|
|
13
|
-
# Airflow 3.x specific imports (wrapped in try/except for version compatibility)
|
|
14
|
-
try:
|
|
15
|
-
from airflow.providers.openlineage.extractors import OperatorLineage
|
|
16
|
-
from airflow.providers.openlineage.sqlparser import DatabaseInfo
|
|
17
|
-
from openlineage.client.event_v2 import Dataset as OpenLineageDataset
|
|
18
|
-
from openlineage.client.facet import SqlJobFacet
|
|
19
|
-
|
|
20
|
-
AIRFLOW3_IMPORTS_AVAILABLE = True
|
|
21
|
-
except ImportError:
|
|
22
|
-
# Not available on Airflow < 3.0
|
|
23
|
-
# Set to None for runtime checks, type checker will see these as None
|
|
24
|
-
OperatorLineage = None # type: ignore[assignment,misc]
|
|
25
|
-
DatabaseInfo = None # type: ignore[assignment,misc]
|
|
26
|
-
OpenLineageDataset = None # type: ignore[assignment,misc]
|
|
27
|
-
SqlJobFacet = None # type: ignore[assignment,misc]
|
|
28
|
-
AIRFLOW3_IMPORTS_AVAILABLE = False
|
|
29
|
-
|
|
30
|
-
# DataHub imports (always available)
|
|
31
|
-
import datahub.emitter.mce_builder as builder
|
|
32
|
-
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
|
|
33
|
-
from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
|
|
34
|
-
from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
|
|
35
|
-
|
|
36
|
-
if TYPE_CHECKING:
|
|
37
|
-
from airflow.providers.openlineage.extractors import OperatorLineage
|
|
38
|
-
from airflow.providers.openlineage.sqlparser import DatabaseInfo
|
|
39
|
-
from openlineage.client.event_v2 import Dataset as OpenLineageDataset
|
|
40
|
-
from openlineage.client.facet import SqlJobFacet
|
|
41
|
-
|
|
42
|
-
logger = logging.getLogger(__name__)
|
|
43
|
-
|
|
44
|
-
# Store the original SQLParser method for fallback
|
|
45
|
-
_original_sql_parser_method: Optional[Callable[..., Any]] = None
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _datahub_generate_openlineage_metadata_from_sql(
    self: Any,
    sql: Any,
    hook: Any,
    database_info: Optional["DatabaseInfo"],
    database: Optional[str] = None,
    sqlalchemy_engine: Optional[Any] = None,
    use_connection: bool = True,
) -> Optional["OperatorLineage"]:
    """
    Override SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.

    In Airflow 3.0+, SQL operators call SQLParser directly rather than using extractors,
    so this function is installed in place of the original method and produces lineage
    with DataHub's parser (which supports column-level lineage).

    When the OpenLineage plugin is enabled (disable_openlineage_plugin=False) the original
    parser is called as well: its inputs/outputs/job facets are returned, and DataHub's
    parsing result is added to run_facets under DATAHUB_SQL_PARSING_RESULT_KEY for the
    DataHub listener to pick up.

    Args:
        self: The SQLParser instance; ``dialect`` and ``default_schema`` are read from it.
        sql: The SQL to parse. If a list is passed, only its first element is parsed by
            DataHub's parser; the original parser always receives the value unchanged.
        hook: The hook, used to build a minimal DatabaseInfo when ``database_info`` is None.
        database_info: DatabaseInfo object describing the connection, or None.
        database: Default database; takes precedence over ``database_info.database``.
        sqlalchemy_engine: Forwarded unchanged to the original parser.
        use_connection: Forwarded unchanged to the original parser.

    Returns:
        An OperatorLineage. If DataHub's parser fails, the result of the original
        OpenLineage parser is returned instead.

    Raises:
        RuntimeError: If DataHub's parser fails and no original method was stored to
            fall back to.
    """
    # The DataHub parser narrows ``sql`` to a single statement further down. Remember the
    # caller's value so the fallback to the original parser still sees every statement.
    original_sql = sql
    # Initialised outside the try block so the fallback handler can reuse a result that
    # the original parser already produced instead of running it a second time.
    ol_result = None

    try:
        # Import here to avoid circular dependency (datahub_listener -> _airflow_compat -> this module)
        from datahub_airflow_plugin._config import get_lineage_config
        from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener

        # Check if OpenLineage plugin is enabled
        try:
            config = get_lineage_config()
            openlineage_enabled = not config.disable_openlineage_plugin
        except Exception as e:
            logger.warning(
                f"Could not load config to check disable_openlineage_plugin: {e}"
            )
            openlineage_enabled = False

        # If OpenLineage is enabled, call the original parser first to get its results
        if openlineage_enabled and _original_sql_parser_method is not None:
            try:
                logger.debug(
                    "OpenLineage plugin enabled - calling original parser for OpenLineage"
                )
                ol_result = _original_sql_parser_method(
                    self,
                    sql,
                    hook,
                    database_info,
                    database,
                    sqlalchemy_engine,
                    use_connection,
                )
                # Lazy %-formatting: the repr is only built when debug logging is on.
                logger.debug("OpenLineage parser result: %s", ol_result)
            except Exception as e:
                logger.warning(
                    f"Error calling original OpenLineage parser, will use only DataHub parser: {e}",
                    exc_info=True,
                )

        # Handle missing database_info by creating a minimal one from connection
        if database_info is None:
            # NOTE(review): get_connection is called without arguments; if the hook's
            # implementation requires a connection id this raises and the outer handler
            # falls back to the original parser - confirm against the hook API.
            conn = getattr(hook, "get_connection", lambda: None)()
            scheme = getattr(conn, "conn_type", None) if conn else None
            db_name = getattr(conn, "schema", None) if conn else None

            database_info = DatabaseInfo(
                scheme=scheme,
                authority=None,
                database=db_name,
                information_schema_columns=[],
                information_schema_table_name="",
                use_flat_cross_db_query=False,
                is_information_schema_cross_db=False,
                is_uppercase_names=False,
                normalize_name_method=lambda x: x.lower(),
            )
            logger.debug(
                f"Created minimal DatabaseInfo from connection: scheme={scheme}, database={db_name}"
            )

        # Get platform from dialect or from database_info scheme.
        # If dialect is "generic", prefer database_info.scheme (connection type).
        platform = self.dialect or "sql"
        if platform == "generic" and database_info:
            platform = getattr(database_info, "scheme", platform) or platform
        if platform == "generic":
            raise ValueError(
                "Could not determine platform from generic dialect or database_info"
            )

        platform = OL_SCHEME_TWEAKS.get(platform, platform)

        # database_info is a DatabaseInfo object, not a dict, hence getattr.
        default_database = database or getattr(database_info, "database", None)
        default_schema = self.default_schema

        # Handle list of SQL statements
        if isinstance(sql, list):
            logger.debug("Got list of SQL statements. Using first one for parsing.")
            sql = sql[0] if sql else ""

        # Check if SQL still contains templates (should be rendered by operator).
        # ``sql`` is typed Any, so slice its string form rather than the value itself.
        sql_text = str(sql)
        if "{{" in sql_text:
            logger.warning(
                f"SQL still contains Jinja templates - lineage extraction may fail. "
                f"SQL: {sql_text[:200]}... "
                f"This usually means templates weren't rendered before SQL parsing."
            )

        # Run DataHub's SQL parser
        listener = get_airflow_plugin_listener()
        graph = listener.graph if listener else None

        logger.debug(
            "Running DataHub SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
            "with graph client" if graph else "in offline mode",
            platform,
            default_database,
            default_schema,
            sql,
        )

        sql_parsing_result = create_lineage_sql_parsed_result(
            query=sql,
            graph=graph,
            platform=platform,
            platform_instance=None,
            env=builder.DEFAULT_ENV,
            default_db=default_database,
            default_schema=default_schema,
        )

        logger.debug("DataHub SQL parser result: %s", sql_parsing_result)

        # If the original parser produced a result, keep its inputs/outputs/job facets
        # and only add DataHub's parsing result to the run facets.
        if ol_result is not None:
            logger.debug(
                "Using OpenLineage parser result for OperatorLineage, "
                "adding DataHub parsing to run_facets"
            )
            # Copy into a new dict rather than mutating the facets of ol_result.
            updated_run_facets = dict(ol_result.run_facets or {})
            updated_run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = sql_parsing_result

            return OperatorLineage(  # type: ignore[misc]
                inputs=ol_result.inputs,
                outputs=ol_result.outputs,
                job_facets=ol_result.job_facets,
                run_facets=updated_run_facets,
            )

        # OpenLineage is disabled or original parser failed - use DataHub's parsing for everything
        logger.debug(
            "OpenLineage plugin disabled or parser unavailable - "
            "using DataHub parser result for OperatorLineage"
        )

        def _urn_to_ol_dataset(urn: str) -> "OpenLineageDataset":
            """Convert DataHub URN to OpenLineage Dataset format."""
            # URN format: urn:li:dataset:(urn:li:dataPlatform:{platform},{database}.{schema}.{table},{env})
            try:
                parts = urn.split(",")
                if len(parts) >= 2:
                    # The second comma-separated field is the table path; the platform
                    # plus default database is used as the namespace.
                    return OpenLineageDataset(
                        namespace=f"{platform}://{default_database or 'default'}",
                        name=parts[1],
                    )
            except Exception as e:
                logger.debug(f"Error converting URN {urn} to OL Dataset: {e}")

            # Fallback: use URN as name
            return OpenLineageDataset(namespace=f"{platform}://default", name=urn)

        inputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.in_tables]
        outputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.out_tables]

        return OperatorLineage(  # type: ignore[misc]
            inputs=inputs,
            outputs=outputs,
            job_facets={"sql": SqlJobFacet(query=sql)},
            run_facets={DATAHUB_SQL_PARSING_RESULT_KEY: sql_parsing_result},
        )

    except Exception as e:
        logger.warning(
            f"Error in DataHub SQL parser, falling back to default OpenLineage parser: {e}",
            exc_info=True,
        )
        # The original parser already ran successfully above: reuse its result rather
        # than executing it (and any connection work it does) a second time.
        if ol_result is not None:
            return ol_result
        if _original_sql_parser_method is None:
            raise RuntimeError(
                "Original SQLParser method not stored. patch_sqlparser() may not have been called."
            ) from None
        # Pass the caller's SQL, not the single statement narrowed for DataHub's parser.
        return _original_sql_parser_method(
            self,
            original_sql,
            hook,
            database_info,
            database,
            sqlalchemy_engine,
            use_connection,
        )
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
class SQLParserPatch:
    """
    Installs and removes DataHub's replacement for
    SQLParser.generate_openlineage_metadata_from_sql.

    The original method is kept in the module-level ``_original_sql_parser_method``
    variable, so every instance of this class shares the same patch state.

    Usage:
        # Scoped patching (handy in tests): patched inside the block, restored on exit
        with SQLParserPatch():
            pass

        # Manual control
        patcher = SQLParserPatch()
        patcher.patch()
        try:
            # ... plugin lifetime ...
        finally:
            patcher.unpatch()
    """

    def patch(self) -> "SQLParserPatch":
        """
        Swap SQLParser.generate_openlineage_metadata_from_sql for DataHub's version.

        The original method is remembered the first time only, so calling this
        repeatedly never overwrites the stored original. If SQLParser cannot be
        imported, nothing is patched.

        Returns:
            self for method chaining
        """
        global _original_sql_parser_method

        try:
            from airflow.providers.openlineage.sqlparser import SQLParser
        except ImportError:
            # SQLParser not available (Airflow < 3.0 or openlineage provider not installed)
            logger.debug(
                "SQLParser not available, skipping patch (likely Airflow < 3.0)"
            )
        else:
            not_yet_saved = _original_sql_parser_method is None
            if not_yet_saved:
                # Keep the original around for fallback and for unpatch()
                _original_sql_parser_method = (
                    SQLParser.generate_openlineage_metadata_from_sql
                )

            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[assignment,method-assign]
                _datahub_generate_openlineage_metadata_from_sql  # type: ignore[assignment,method-assign]
            )
            logger.debug(
                "Patched SQLParser.generate_openlineage_metadata_from_sql with DataHub SQL parser"
            )

        return self

    def unpatch(self) -> "SQLParserPatch":
        """
        Put the original SQLParser method back and forget the stored reference.

        Does nothing when no original method is stored. The stored reference is
        cleared even if restoring fails, so a later patch() starts from scratch.

        Returns:
            self for method chaining
        """
        global _original_sql_parser_method

        saved_method = _original_sql_parser_method
        if saved_method is None:
            logger.debug("SQLParser not patched, nothing to unpatch")
            return self

        try:
            from airflow.providers.openlineage.sqlparser import SQLParser
        except ImportError:
            logger.debug("SQLParser not available, nothing to unpatch")
        else:
            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[method-assign]
                saved_method
            )
            logger.debug("Unpatched SQLParser, restored original method")
        finally:
            # Always reset so that patch() can capture the original again
            _original_sql_parser_method = None

        return self

    def __enter__(self) -> "SQLParserPatch":
        """Apply the patch when entering the ``with`` block."""
        return self.patch()

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Remove the patch when leaving the ``with`` block; exceptions are not suppressed."""
        self.unpatch()
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
# Global patcher instance for backward compatibility.
# patch_sqlparser() and unpatch_sqlparser() below delegate to this single
# module-level instance.
_global_patcher = SQLParserPatch()
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
def patch_sqlparser() -> None:
    """
    Install DataHub's SQL parser in place of
    SQLParser.generate_openlineage_metadata_from_sql.

    Thin function-style wrapper around the module-level SQLParserPatch instance,
    kept for backward compatibility with existing callers. Call it early during
    plugin initialization, before any SQL operators are used.

    Behaviour of the installed replacement:
    - With disable_openlineage_plugin=False, BOTH parsers run: OpenLineage's
      original parser supplies inputs/outputs, and DataHub's parsing result
      (with column-level lineage) is stored in run_facets for the DataHub listener.
    - With disable_openlineage_plugin=True, only DataHub's parser runs and supplies
      both the OperatorLineage structure and the enhanced parsing result.
    """
    _global_patcher.patch()
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
def unpatch_sqlparser() -> None:
    """
    Restore the original SQLParser method.

    Thin function-style wrapper around the module-level SQLParserPatch instance,
    mirroring patch_sqlparser(). Mostly useful in tests to get a clean state
    between test cases; in production the patch usually stays in place for the
    lifetime of the process.
    """
    _global_patcher.unpatch()
|
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
# Airflow 3.x compatibility module
|
|
2
|
-
# This module must be imported before any Airflow imports in any of our files.
|
|
3
|
-
|
|
4
|
-
import logging
|
|
5
|
-
from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
|
|
6
|
-
|
|
7
|
-
logger = logging.getLogger(__name__)
|
|
8
|
-
|
|
9
|
-
# Critical safety check: Ensure MarkupSafe compatibility patch is applied.
# This must happen before importing Airflow to prevent MarkupSafe version conflicts.
# Using explicit exception instead of assert to ensure it runs even with python -O.
if not MARKUPSAFE_PATCHED:
    raise RuntimeError(
        "MarkupSafe compatibility patch must be applied before importing Airflow modules. "
        "This is a critical safety check that cannot be disabled. "
        "The patch ensures compatibility between different MarkupSafe versions used by "
        "Airflow and DataHub dependencies."
    )

# Apply Airflow 3.x patches.
# The imports below must come after the MARKUPSAFE_PATCHED check because they import
# Airflow modules, and markupsafe has to be patched first.

# Load configuration to determine which patches to apply
try:
    from datahub_airflow_plugin._config import get_lineage_config

    config = get_lineage_config()
    enable_extractors = config.enable_extractors
    patch_sql_parser = config.patch_sql_parser
    extract_athena_operator = config.extract_athena_operator
    extract_bigquery_insert_job_operator = config.extract_bigquery_insert_job_operator
    extract_teradata_operator = config.extract_teradata_operator
except Exception as e:
    # If config loading fails, apply all patches by default (backward compatibility).
    # Log the failure so that "everything got patched" is explainable from the logs.
    logger.warning(
        f"Could not load DataHub lineage config, applying all Airflow 3 patches by default: {e}",
        exc_info=True,
    )
    enable_extractors = True
    patch_sql_parser = True
    extract_athena_operator = True
    extract_bigquery_insert_job_operator = True
    extract_teradata_operator = True

# Only apply patches if extractors are enabled.
# Every patch is best-effort: a missing optional dependency is logged at debug level,
# any other failure is logged as a warning, and neither aborts the plugin import.
if enable_extractors:
    # Airflow 3.0+ SQLParser patch
    if patch_sql_parser:
        try:
            from datahub_airflow_plugin.airflow3._airflow3_sql_parser_patch import (
                patch_sqlparser,
            )

            patch_sqlparser()
            logger.debug("✓ Successfully applied Airflow 3 SQL parser patch")
        except ImportError as e:
            # Not available when openlineage packages aren't installed
            logger.warning(
                f"SQL parser patch not applied - OpenLineage packages not available: {e}"
            )
        except Exception as e:
            logger.warning(f"Failed to apply SQL parser patch: {e}", exc_info=True)

    # Operator-specific patches (conditional based on config and operator availability).
    # SQLite patch is always applied when available (no config flag yet).
    try:
        from datahub_airflow_plugin.airflow3._sqlite_openlineage_patch import (
            patch_sqlite_hook,
        )

        patch_sqlite_hook()
    except ImportError as e:
        logger.debug(f"SQLite OpenLineage patch not applied: {e}")
    except Exception as e:
        logger.warning(f"Failed to apply SQLite OpenLineage patch: {e}", exc_info=True)

    if extract_athena_operator:
        try:
            from datahub_airflow_plugin.airflow3._athena_openlineage_patch import (
                patch_athena_operator,
            )

            patch_athena_operator()
        except ImportError as e:
            logger.debug(f"Athena OpenLineage patch not applied: {e}")
        except Exception as e:
            logger.warning(
                f"Failed to apply Athena OpenLineage patch: {e}", exc_info=True
            )

    if extract_bigquery_insert_job_operator:
        try:
            from datahub_airflow_plugin.airflow3._bigquery_openlineage_patch import (
                patch_bigquery_insert_job_operator,
            )

            patch_bigquery_insert_job_operator()
        except ImportError as e:
            logger.debug(f"BigQuery OpenLineage patch not applied: {e}")
        except Exception as e:
            logger.warning(
                f"Failed to apply BigQuery OpenLineage patch: {e}", exc_info=True
            )

    if extract_teradata_operator:
        try:
            from datahub_airflow_plugin.airflow3._teradata_openlineage_patch import (
                patch_teradata_operator,
            )

            patch_teradata_operator()
        except ImportError as e:
            logger.debug(f"Teradata OpenLineage patch not applied: {e}")
        except Exception as e:
            logger.warning(
                f"Failed to apply Teradata OpenLineage patch: {e}", exc_info=True
            )

# Marker that importing this module completed.
AIRFLOW_PATCHED = True

__all__ = [
    "AIRFLOW_PATCHED",
]
|
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Patch for AthenaOperator to use DataHub's SQL parser.
|
|
3
|
-
|
|
4
|
-
AthenaOperator in Airflow 3.x uses SQLParser with dialect="generic", which doesn't provide
|
|
5
|
-
column-level lineage. This patch modifies get_openlineage_facets_on_complete() to use
|
|
6
|
-
DataHub's SQL parser instead, enabling column-level lineage extraction.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import logging
|
|
10
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
11
|
-
|
|
12
|
-
import datahub.emitter.mce_builder as builder
|
|
13
|
-
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
|
|
14
|
-
from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from airflow.models.taskinstance import TaskInstance
|
|
18
|
-
from airflow.providers.openlineage.extractors import OperatorLineage
|
|
19
|
-
|
|
20
|
-
logger = logging.getLogger(__name__)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def patch_athena_operator() -> None:
    """
    Patch AthenaOperator to use DataHub's SQL parser for lineage extraction.

    Wraps AthenaOperator.get_openlineage_facets_on_complete so that the
    OperatorLineage it returns also carries DataHub's SQL parsing result in
    run_facets under DATAHUB_SQL_PARSING_RESULT_KEY.

    The patch is skipped when the Amazon provider cannot be imported, when the
    operator has no get_openlineage_facets_on_complete method, or when the
    operator is already marked as patched.
    """
    try:
        from airflow.providers.amazon.aws.operators.athena import AthenaOperator
    except ImportError as e:
        logger.debug(
            f"Could not patch AthenaOperator for OpenLineage (provider not installed or Airflow < 3.0): {e}"
        )
        return

    # Check if the method exists (only in Airflow 3.x)
    if not hasattr(AthenaOperator, "get_openlineage_facets_on_complete"):
        logger.debug(
            "AthenaOperator.get_openlineage_facets_on_complete not found - "
            "likely Airflow 2.x, skipping patch"
        )
        return

    # Check if already patched
    if hasattr(AthenaOperator, "_datahub_openlineage_patched"):
        logger.debug("AthenaOperator already patched for OpenLineage")
        return

    # Store original method
    original_get_openlineage_facets_on_complete = (
        AthenaOperator.get_openlineage_facets_on_complete
    )

    def get_openlineage_facets_on_complete(
        self: Any, task_instance: "TaskInstance"
    ) -> Optional["OperatorLineage"]:
        """
        Enhanced version that uses DataHub's SQL parser for better lineage.

        Calls the original OpenLineage implementation exactly once, then tries to
        add DataHub's SQL parsing result to its run_facets. Any failure while
        enhancing is logged and the original result is returned as-is.
        """
        # Templates are already rendered by Airflow during task execution.
        rendered_query = getattr(self, "query", None)
        # str() keeps this log line safe when the query is None.
        logger.debug(
            "DataHub patched Athena get_openlineage_facets_on_complete called for query: %s",
            str(rendered_query)[:100],
        )

        # Run the original implementation outside the enhancement try block: its
        # own errors propagate unchanged, and it is never invoked a second time.
        operator_lineage = original_get_openlineage_facets_on_complete(
            self, task_instance
        )

        if not operator_lineage:
            logger.debug("Original OpenLineage returned None for Athena operator")
            return operator_lineage

        try:
            logger.debug(
                f"Original Athena OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
            )

            # The SQLParser patch may already have stored a parsing result; only
            # parse here when it has not.
            if DATAHUB_SQL_PARSING_RESULT_KEY in operator_lineage.run_facets:
                logger.debug(
                    f"DataHub SQL parsing result already present in run_facets "
                    f"(added by SQLParser patch) with "
                    f"{len(operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY].column_lineage or [])} column lineages"
                )
                return operator_lineage

            try:
                platform = "athena"
                default_database = getattr(self, "database", None)

                logger.debug(
                    f"Running DataHub SQL parser for Athena (platform={platform}, "
                    f"default_db={default_database}): {rendered_query[:200] if rendered_query else 'None'}"
                )

                # Use DataHub's SQL parser
                sql_parsing_result = create_lineage_sql_parsed_result(
                    query=rendered_query,
                    platform=platform,
                    platform_instance=None,
                    env=builder.DEFAULT_ENV,
                    default_db=default_database,
                    default_schema=None,
                )

                # Store the SQL parsing result in run_facets for DataHub listener
                if sql_parsing_result:
                    operator_lineage.run_facets[
                        DATAHUB_SQL_PARSING_RESULT_KEY
                    ] = sql_parsing_result
                    logger.debug(
                        f"Added DataHub SQL parsing result with "
                        f"{len(sql_parsing_result.column_lineage or [])} column lineages"
                    )
            except Exception as e:
                logger.warning(
                    f"Error running DataHub SQL parser for Athena: {e}",
                    exc_info=True,
                )
        except Exception as e:
            logger.warning(
                f"Error in patched AthenaOperator.get_openlineage_facets_on_complete: {e}",
                exc_info=True,
            )

        # Enhanced on success, otherwise the untouched original result.
        return operator_lineage

    # Apply the patch (mypy doesn't like dynamic method assignment, but it's necessary for patching)
    AthenaOperator.get_openlineage_facets_on_complete = (  # type: ignore[assignment,method-assign]
        get_openlineage_facets_on_complete  # type: ignore[assignment]
    )
    # Marker checked at the top of this function to avoid wrapping twice.
    AthenaOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]

    logger.debug(
        "Patched AthenaOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
    )
|