acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py
@@ -0,0 +1,402 @@
"""
Patch for Airflow 2.10+ with apache-airflow-providers-openlineage SQLParser.

When using apache-airflow-providers-openlineage with Airflow 2.10+, SQL operators call
SQLParser.generate_openlineage_metadata_from_sql() directly rather than using extractors.
This module patches that method to use DataHub's SQL parser, which provides better
column-level lineage support.

This is analogous to the Airflow 3 SQL parser patch, but adapted for Airflow 2.10+
when the provider package is installed.
"""

import logging
from types import TracebackType
from typing import TYPE_CHECKING, Any, Callable, Optional

# Try importing from provider package (Airflow 2.10+ with apache-airflow-providers-openlineage)
try:
    from airflow.providers.openlineage.extractors import OperatorLineage
    from airflow.providers.openlineage.sqlparser import DatabaseInfo
    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
    from openlineage.client.facet import SqlJobFacet

    PROVIDER_IMPORTS_AVAILABLE = True
except ImportError:
    # Provider package not available
    OperatorLineage = None  # type: ignore[assignment,misc]
    DatabaseInfo = None  # type: ignore[assignment,misc]
    OpenLineageDataset = None  # type: ignore[assignment,misc]
    SqlJobFacet = None  # type: ignore[assignment,misc]
    PROVIDER_IMPORTS_AVAILABLE = False

# DataHub imports (always available)
import datahub.emitter.mce_builder as builder
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS

if TYPE_CHECKING:
    from airflow.providers.openlineage.extractors import OperatorLineage
    from airflow.providers.openlineage.sqlparser import DatabaseInfo
    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
    from openlineage.client.facet import SqlJobFacet

logger = logging.getLogger(__name__)

# Store the original SQLParser method for fallback
_original_sql_parser_method: Optional[Callable[..., Any]] = None


def _datahub_generate_openlineage_metadata_from_sql(
    self: Any,
    sql: Any,
    hook: Any,
    database_info: dict,
    database: Optional[str] = None,
    sqlalchemy_engine: Optional[Any] = None,
    use_connection: bool = True,
) -> Optional["OperatorLineage"]:
    """
    Override SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.

    This is necessary because in Airflow 2.10+ with provider package, SQL operators call
    SQLParser directly rather than using extractors. We intercept this call and use
    DataHub's SQL parser to generate lineage with column-level lineage support.

    When OpenLineage plugin is enabled (disable_openlineage_plugin=False), we call both
    parsers: OpenLineage gets its own parsing results, while DataHub's enhanced parsing
    is stored in a custom facet for the DataHub listener to extract.
    """
    try:
        # Import here to avoid circular dependency
        from datahub_airflow_plugin._config import get_lineage_config
        from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener

        # Check if OpenLineage plugin is enabled
        try:
            config = get_lineage_config()
            openlineage_enabled = not config.disable_openlineage_plugin
        except Exception as e:
            logger.warning(
                f"Could not load config to check disable_openlineage_plugin: {e}"
            )
            openlineage_enabled = False

        # If OpenLineage is enabled, call the original parser first to get its results
        ol_result = None
        if openlineage_enabled and _original_sql_parser_method is not None:
            try:
                logger.debug(
                    "OpenLineage plugin enabled - calling original parser for OpenLineage"
                )
                ol_result = _original_sql_parser_method(
                    self,
                    sql,
                    hook,
                    database_info,
                    database,
                    sqlalchemy_engine,
                    use_connection,
                )
                logger.debug(f"OpenLineage parser result: {ol_result}")
            except Exception as e:
                logger.warning(
                    f"Error calling original OpenLineage parser, will use only DataHub parser: {e}",
                    exc_info=True,
                )

        # Handle missing database_info by creating a minimal one from connection
        if database_info is None:
            # Get basic properties from hook's connection
            conn = getattr(hook, "get_connection", lambda: None)()
            scheme = getattr(conn, "conn_type", None) if conn else None
            db_name = getattr(conn, "schema", None) if conn else None

            database_info = DatabaseInfo(
                scheme=scheme,
                authority=None,
                database=db_name,
                information_schema_columns=[],
                information_schema_table_name="",
                use_flat_cross_db_query=False,
                is_information_schema_cross_db=False,
                is_uppercase_names=False,
                normalize_name_method=lambda x: x.lower(),
            )
            logger.debug(
                f"Created minimal DatabaseInfo from connection: scheme={scheme}, database={db_name}"
            )

        # Get platform from dialect or from database_info scheme
        # If dialect is "generic", prefer database_info.scheme (connection type)
        platform = self.dialect or "sql"
        if platform == "generic" and database_info:
            # Use the actual connection type instead of "generic"
            platform = getattr(database_info, "scheme", platform) or platform
            if platform == "generic":
                raise ValueError(
                    "Could not determine platform from generic dialect or database_info"
                )

        platform = OL_SCHEME_TWEAKS.get(platform, platform)
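
        # Illustrative note: a connection whose conn_type is "postgres" but
        # whose SQLParser dialect is "generic" resolves to platform "postgres"
        # here; OL_SCHEME_TWEAKS then remaps scheme names where OpenLineage
        # and DataHub platform identifiers differ (the actual mapping lives
        # in _datahub_ol_adapter).
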
        # Get default database and schema
        default_database = database or getattr(database_info, "database", None)
        default_schema = self.default_schema

        # Handle list of SQL statements
        if isinstance(sql, list):
            logger.debug("Got list of SQL statements. Using first one for parsing.")
            sql = sql[0] if sql else ""

        # Run DataHub's SQL parser
        listener = get_airflow_plugin_listener()
        graph = listener.graph if listener else None

        logger.debug(
            "Running DataHub SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
            "with graph client" if graph else "in offline mode",
            platform,
            default_database,
            default_schema,
            sql,
        )

        sql_parsing_result = create_lineage_sql_parsed_result(
            query=sql,
            graph=graph,
            platform=platform,
            platform_instance=None,
            env=builder.DEFAULT_ENV,
            default_db=default_database,
            default_schema=default_schema,
        )

        logger.debug(f"DataHub SQL parser result: {sql_parsing_result}")

        # Store the sql_parsing_result in run_facets for later retrieval by the DataHub listener
        # If OpenLineage plugin is enabled and we got a result from the original parser,
        # use OpenLineage's result but add DataHub's parsing to the facets
        if ol_result is not None:
            logger.debug(
                "Using OpenLineage parser result for OperatorLineage, "
                "adding DataHub parsing to run_facets"
            )
            # Add DataHub's SQL parsing result to the existing run_facets
            # OperatorLineage is frozen (uses @define), so we need to create a new dict
            updated_run_facets = dict(ol_result.run_facets or {})
            updated_run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = sql_parsing_result

            # Create new OperatorLineage with OpenLineage's inputs/outputs but DataHub's facet
            operator_lineage = OperatorLineage(  # type: ignore[misc]
                inputs=ol_result.inputs,
                outputs=ol_result.outputs,
                job_facets=ol_result.job_facets,
                run_facets=updated_run_facets,
            )
            return operator_lineage

        # OpenLineage is disabled or original parser failed - use DataHub's parsing for everything
        logger.debug(
            "OpenLineage plugin disabled or parser unavailable - "
            "using DataHub parser result for OperatorLineage"
        )

        # Convert DataHub URNs to OpenLineage Dataset objects
        def _urn_to_ol_dataset(urn: str) -> "OpenLineageDataset":
            """Convert DataHub URN to OpenLineage Dataset format."""
            # Parse URN to extract database, schema, table
            # URN format: urn:li:dataset:(urn:li:dataPlatform:{platform},{database}.{schema}.{table},{env})
            try:
                parts = urn.split(",")
                if len(parts) >= 2:
                    # Extract table path from URN
                    table_path = parts[1]  # e.g., "database.schema.table"

                    # Create OL namespace and name
                    # For now, use platform as namespace and full path as name
                    namespace = f"{platform}://{default_database or 'default'}"
                    name = table_path

                    return OpenLineageDataset(namespace=namespace, name=name)
            except Exception as e:
                logger.debug(f"Error converting URN {urn} to OL Dataset: {e}")

            # Fallback: use URN as name
            return OpenLineageDataset(namespace=f"{platform}://default", name=urn)
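
        # For example, the URN
        #   urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)
        # splits so that parts[1] == "db.schema.orders"; with platform
        # "snowflake" and default_database "db", the helper above returns
        # OpenLineageDataset(namespace="snowflake://db", name="db.schema.orders").
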
        inputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.in_tables]
        outputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.out_tables]

        run_facets = {DATAHUB_SQL_PARSING_RESULT_KEY: sql_parsing_result}

        # Create OperatorLineage with DataHub's results
        operator_lineage = OperatorLineage(  # type: ignore[misc]
            inputs=inputs,
            outputs=outputs,
            job_facets={"sql": SqlJobFacet(query=sql)},
            run_facets=run_facets,
        )
        return operator_lineage

    except Exception as e:
        logger.warning(
            f"Error in DataHub SQL parser, falling back to default OpenLineage parser: {e}",
            exc_info=True,
        )
        # Fall back to original implementation
        if _original_sql_parser_method is None:
            raise RuntimeError(
                "Original SQLParser method not stored. patch_sqlparser() may not have been called."
            ) from None
        return _original_sql_parser_method(
            self, sql, hook, database_info, database, sqlalchemy_engine, use_connection
        )
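
# Illustrative sketch only (the real consumer is the DataHub listener, which
# lives outside this module): the stored facet is read back downstream as
#
#     parsed = operator_lineage.run_facets.get(DATAHUB_SQL_PARSING_RESULT_KEY)
#     if parsed is not None:
#         upstream_urns = parsed.in_tables      # DataHub dataset URNs
#         downstream_urns = parsed.out_tables
#
# in_tables/out_tables are the same URN lists converted to OpenLineage
# datasets above.
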

class SQLParserPatch:
    """
    Context manager for patching Airflow's SQLParser with DataHub's SQL parser.

    This class encapsulates the patching logic and manages the global state properly.
    It can be used as a context manager for automatic cleanup, or with explicit
    patch/unpatch methods for manual control.

    Usage:
        # As a context manager (recommended for testing)
        with SQLParserPatch():
            # Code runs with patched SQLParser
            pass
        # Automatically unpatched on exit

        # Or with explicit control
        patcher = SQLParserPatch()
        patcher.patch()
        try:
            # ... plugin lifetime ...
        finally:
            patcher.unpatch()

    The patch stores the original SQLParser method and replaces it with DataHub's
    enhanced implementation that provides column-level lineage support.
    """

    def patch(self) -> "SQLParserPatch":
        """
        Apply the SQLParser patch.

        Stores the original SQLParser.generate_openlineage_metadata_from_sql method
        and replaces it with DataHub's enhanced implementation.

        Returns:
            self for method chaining
        """
        global _original_sql_parser_method

        try:
            from airflow.providers.openlineage.sqlparser import SQLParser

            # Store original method for fallback (only if not already patched)
            if _original_sql_parser_method is None:
                _original_sql_parser_method = (
                    SQLParser.generate_openlineage_metadata_from_sql
                )

            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[assignment,method-assign]
                _datahub_generate_openlineage_metadata_from_sql  # type: ignore[assignment,method-assign]
            )
            logger.debug(
                "Patched SQLParser.generate_openlineage_metadata_from_sql with DataHub SQL parser"
            )

        except ImportError:
            # SQLParser not available (provider package not installed or Airflow < 2.10)
            logger.debug(
                "SQLParser not available, skipping patch (likely Airflow < 2.10 or provider package not installed)"
            )

        return self

    def unpatch(self) -> "SQLParserPatch":
        """
        Remove the SQLParser patch and restore the original method.

        This is primarily useful for testing to ensure clean state between tests.
        In production, the patch typically stays active for the process lifetime.

        Returns:
            self for method chaining
        """
        global _original_sql_parser_method

        if _original_sql_parser_method is None:
            logger.debug("SQLParser not patched, nothing to unpatch")
            return self

        try:
            from airflow.providers.openlineage.sqlparser import SQLParser

            # Restore original method
            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[method-assign]
                _original_sql_parser_method
            )
            logger.debug("Unpatched SQLParser, restored original method")

        except ImportError:
            logger.debug("SQLParser not available, nothing to unpatch")
        finally:
            # Clear the stored reference to allow re-patching
            _original_sql_parser_method = None

        return self

    def __enter__(self) -> "SQLParserPatch":
        """Context manager entry: apply the patch."""
        return self.patch()

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Context manager exit: remove the patch."""
        self.unpatch()


# Global patcher instance for backward compatibility
_global_patcher = SQLParserPatch()


def patch_sqlparser() -> None:
    """
    Patch SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.

    This is a convenience function that wraps SQLParserPatch.patch() for backward
    compatibility with existing code.

    This should be called early in the plugin initialization, before any SQL operators are used.

    When both DataHub and OpenLineage plugins are enabled (disable_openlineage_plugin=False),
    the patch calls BOTH parsers:
    - OpenLineage's original parser provides inputs/outputs for OpenLineage plugin
    - DataHub's enhanced parser (with column-level lineage) is stored in run_facets
      for DataHub listener to extract

    When only DataHub is enabled (disable_openlineage_plugin=True), only DataHub's
    parser runs and provides both the OperatorLineage structure and the enhanced parsing.
    """
    _global_patcher.patch()


def unpatch_sqlparser() -> None:
    """
    Remove the SQLParser patch and restore the original method.

    This is a convenience function that wraps SQLParserPatch.unpatch() for consistency.

    This is primarily useful for testing to ensure clean state between tests.
    In production, the patch typically stays active for the process lifetime.
    """
    _global_patcher.unpatch()
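The context-manager protocol above makes the patch easy to exercise in isolation. A minimal test sketch (illustrative, not part of the package; it assumes the provider package is importable):

    # Verify that SQLParserPatch swaps in the override and restores the original.
    from airflow.providers.openlineage.sqlparser import SQLParser
    from datahub_airflow_plugin.airflow2._airflow2_sql_parser_patch import (
        SQLParserPatch,
    )

    def test_sqlparser_patch_roundtrip() -> None:
        original = SQLParser.generate_openlineage_metadata_from_sql
        with SQLParserPatch():
            # patch() replaced the provider's method with DataHub's override.
            assert SQLParser.generate_openlineage_metadata_from_sql is not original
        # __exit__ called unpatch(), restoring the original method.
        assert SQLParser.generate_openlineage_metadata_from_sql is original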
datahub_airflow_plugin/airflow2/_airflow_compat.py
@@ -0,0 +1,95 @@
# Airflow 2.x compatibility module
# This module must be imported before any Airflow imports in any of our files.

from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED

# Critical safety check: Ensure MarkupSafe compatibility patch is applied
# This must happen before importing Airflow to prevent MarkupSafe version conflicts
# Using explicit exception instead of assert to ensure it runs even with python -O
if not MARKUPSAFE_PATCHED:
    raise RuntimeError(
        "MarkupSafe compatibility patch must be applied before importing Airflow modules. "
        "This is a critical safety check that cannot be disabled. "
        "The patch ensures compatibility between different MarkupSafe versions used by "
        "Airflow and DataHub dependencies."
    )

# Apply SQLParser patch for Airflow 2.10+ with apache-airflow-providers-openlineage
# When using the provider package, SQL operators call SQLParser.generate_openlineage_metadata_from_sql()
# directly (similar to Airflow 3.x), so we need to patch that method to use DataHub's SQL parser.
#
# For legacy openlineage-airflow package (Airflow 2.5-2.9), we use the extractor-based approach
# in _extractors.py instead.
import importlib.util
import logging

logger = logging.getLogger(__name__)

# Check if OpenLineage provider package is available
# Use try-except because find_spec can raise ModuleNotFoundError if parent module doesn't exist
try:
    has_openlineage_provider = (
        importlib.util.find_spec("airflow.providers.openlineage.sqlparser") is not None
    )
except (ModuleNotFoundError, ImportError, ValueError):
    # Parent module doesn't exist or other import error
    has_openlineage_provider = False

if has_openlineage_provider:
    # Provider package detected - apply SQL parser patch
    from datahub_airflow_plugin.airflow2._airflow2_sql_parser_patch import (
        patch_sqlparser,
    )

    patch_sqlparser()
else:
    # Provider package not available - using legacy openlineage-airflow package
    # No patching needed, extractors will handle SQL parsing
    pass

# Apply operator-specific patches for provider mode
# These patches work for both Airflow 2.x and 3.x when using OpenLineage provider
try:
    from datahub_airflow_plugin._config import get_lineage_config

    config = get_lineage_config()
    enable_extractors = config.enable_extractors
    extract_teradata_operator = config.extract_teradata_operator
except Exception:
    # If config loading fails, apply patches by default (backward compatibility)
    enable_extractors = True
    extract_teradata_operator = True

if enable_extractors and extract_teradata_operator:
    # TeradataOperator patch - works for both Airflow 2.x provider mode and Airflow 3.x
    # The patch checks for method existence, so it's safe to import from airflow3 module
    # Note: We defer the import to avoid potential issues with Airflow 3.x specific imports
    # in Airflow 2.x environments. The patch function itself handles version compatibility.
    import logging

    logger = logging.getLogger(__name__)
    try:
        logger.debug("Attempting to import and apply TeradataOperator patch")
        # Use importlib to safely import the patch module
        import importlib.util

        patch_module_path = (
            "datahub_airflow_plugin.airflow3._teradata_openlineage_patch"
        )
        patch_module = importlib.import_module(patch_module_path)
        patch_teradata_operator = patch_module.patch_teradata_operator

        patch_teradata_operator()
        logger.debug("TeradataOperator patch import and call completed")
    except ImportError as e:
        # Teradata provider not installed or patch not available
        logger.debug(f"Could not import TeradataOperator patch: {e}")
    except Exception as e:
        # Log error but don't fail - this is optional functionality
        logger.warning(f"Error applying TeradataOperator patch: {e}", exc_info=True)

AIRFLOW_PATCHED = True

__all__ = [
    "AIRFLOW_PATCHED",
]
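Because everything in this module runs as an import-time side effect, correctness depends only on import order. A minimal usage sketch (illustrative; `DAG` stands in for whatever the caller imports from Airflow):

    # Import the compat module first; its side effects (MarkupSafe check,
    # SQLParser patch, optional TeradataOperator patch) run exactly once,
    # thanks to Python's module caching.
    from datahub_airflow_plugin.airflow2._airflow_compat import AIRFLOW_PATCHED

    # Only after that is it safe to import Airflow itself.
    from airflow.models import DAG

    assert AIRFLOW_PATCHED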