acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/airflow3/__init__.py
@@ -0,0 +1,6 @@
+ """
+ DataHub Airflow Plugin v2 for Airflow 3.x.
+
+ This module provides the DataHub listener implementation for Airflow 3.x,
+ using the native OpenLineage provider and SQL parser patches for lineage.
+ """
datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py
@@ -0,0 +1,408 @@
+ """
+ Patch for Airflow 3.0+ SQLParser to use DataHub's SQL parser.
+
+ In Airflow 3.0+, SQL operators call SQLParser.generate_openlineage_metadata_from_sql()
+ directly rather than using extractors. This module patches that method to use DataHub's
+ SQL parser, which provides better column-level lineage support.
+ """
+
+ import logging
+ from types import TracebackType
+ from typing import TYPE_CHECKING, Any, Callable, Optional
+
+ # Airflow 3.x specific imports (wrapped in try/except for version compatibility)
+ try:
+     from airflow.providers.openlineage.extractors import OperatorLineage
+     from airflow.providers.openlineage.sqlparser import DatabaseInfo
+     from openlineage.client.event_v2 import Dataset as OpenLineageDataset
+     from openlineage.client.facet import SqlJobFacet
+
+     AIRFLOW3_IMPORTS_AVAILABLE = True
+ except ImportError:
+     # Not available on Airflow < 3.0.
+     # Set to None for runtime checks; the type checker will see these as None.
+     OperatorLineage = None  # type: ignore[assignment,misc]
+     DatabaseInfo = None  # type: ignore[assignment,misc]
+     OpenLineageDataset = None  # type: ignore[assignment,misc]
+     SqlJobFacet = None  # type: ignore[assignment,misc]
+     AIRFLOW3_IMPORTS_AVAILABLE = False
+
+ # DataHub imports (always available)
+ import datahub.emitter.mce_builder as builder
+ from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
+ from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
+ from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
+
+ if TYPE_CHECKING:
+     from airflow.providers.openlineage.extractors import OperatorLineage
+     from airflow.providers.openlineage.sqlparser import DatabaseInfo
+     from openlineage.client.event_v2 import Dataset as OpenLineageDataset
+     from openlineage.client.facet import SqlJobFacet
+
+ logger = logging.getLogger(__name__)
+
+ # Store the original SQLParser method for fallback
+ _original_sql_parser_method: Optional[Callable[..., Any]] = None
+
+
+ def _datahub_generate_openlineage_metadata_from_sql(
+     self: Any,
+     sql: Any,
+     hook: Any,
+     database_info: Optional["DatabaseInfo"],
+     database: Optional[str] = None,
+     sqlalchemy_engine: Optional[Any] = None,
+     use_connection: bool = True,
+ ) -> Optional["OperatorLineage"]:
+     """
+     Override SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
+
+     This is necessary because in Airflow 3.0+, SQL operators call SQLParser directly
+     rather than using extractors. We intercept this call and use DataHub's SQL parser
+     to generate lineage with column-level lineage support.
+
+     When the OpenLineage plugin is enabled (disable_openlineage_plugin=False), we call both
+     parsers: OpenLineage gets its own parsing results, while DataHub's enhanced parsing
+     is stored in a custom facet for the DataHub listener to extract.
+     """
+     try:
+         # Import here to avoid a circular dependency (datahub_listener -> _airflow_compat -> this module)
+         from datahub_airflow_plugin._config import get_lineage_config
+         from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener
+
+         # Check whether the OpenLineage plugin is enabled
+         try:
+             config = get_lineage_config()
+             openlineage_enabled = not config.disable_openlineage_plugin
+         except Exception as e:
+             logger.warning(
+                 f"Could not load config to check disable_openlineage_plugin: {e}"
+             )
+             openlineage_enabled = False
+
+         # If OpenLineage is enabled, call the original parser first to get its results
+         ol_result = None
+         if openlineage_enabled and _original_sql_parser_method is not None:
+             try:
+                 logger.debug(
+                     "OpenLineage plugin enabled - calling original parser for OpenLineage"
+                 )
+                 ol_result = _original_sql_parser_method(
+                     self,
+                     sql,
+                     hook,
+                     database_info,
+                     database,
+                     sqlalchemy_engine,
+                     use_connection,
+                 )
+                 logger.debug(f"OpenLineage parser result: {ol_result}")
+             except Exception as e:
+                 logger.warning(
+                     f"Error calling original OpenLineage parser, will use only DataHub parser: {e}",
+                     exc_info=True,
+                 )
+
+         # Handle missing database_info by creating a minimal one from the connection
+         if database_info is None:
+             # Get basic properties from the hook's connection
+             conn = getattr(hook, "get_connection", lambda: None)()
+             scheme = getattr(conn, "conn_type", None) if conn else None
+             db_name = getattr(conn, "schema", None) if conn else None
+
+             database_info = DatabaseInfo(
+                 scheme=scheme,
+                 authority=None,
+                 database=db_name,
+                 information_schema_columns=[],
+                 information_schema_table_name="",
+                 use_flat_cross_db_query=False,
+                 is_information_schema_cross_db=False,
+                 is_uppercase_names=False,
+                 normalize_name_method=lambda x: x.lower(),
+             )
+             logger.debug(
+                 f"Created minimal DatabaseInfo from connection: scheme={scheme}, database={db_name}"
+             )
+
+         # Get the platform from the dialect or from database_info's scheme.
+         # If the dialect is "generic", prefer database_info.scheme (the connection type).
+         platform = self.dialect or "sql"
+         if platform == "generic" and database_info:
+             # Use the actual connection type instead of "generic"
+             platform = getattr(database_info, "scheme", platform) or platform
+             if platform == "generic":
+                 raise ValueError(
+                     "Could not determine platform from generic dialect or database_info"
+                 )
+
+         platform = OL_SCHEME_TWEAKS.get(platform, platform)
+
+         # Get the default database and schema.
+         # database_info is a DatabaseInfo object (dataclass/namedtuple), not a dict.
+         default_database = database or getattr(database_info, "database", None)
+         default_schema = self.default_schema
+
+         # Handle a list of SQL statements
+         if isinstance(sql, list):
+             logger.debug("Got list of SQL statements. Using first one for parsing.")
+             sql = sql[0] if sql else ""
+
+         # Check whether the SQL still contains templates (they should be rendered by the operator)
+         if "{{" in str(sql):
+             logger.warning(
+                 f"SQL still contains Jinja templates - lineage extraction may fail. "
+                 f"SQL: {sql[:200]}... "
+                 f"This usually means templates weren't rendered before SQL parsing."
+             )
+
+         # Run DataHub's SQL parser
+         listener = get_airflow_plugin_listener()
+         graph = listener.graph if listener else None
+
+         logger.debug(
+             "Running DataHub SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
+             "with graph client" if graph else "in offline mode",
+             platform,
+             default_database,
+             default_schema,
+             sql,
+         )
+
+         sql_parsing_result = create_lineage_sql_parsed_result(
+             query=sql,
+             graph=graph,
+             platform=platform,
+             platform_instance=None,
+             env=builder.DEFAULT_ENV,
+             default_db=default_database,
+             default_schema=default_schema,
+         )
+
+         logger.debug(f"DataHub SQL parser result: {sql_parsing_result}")
+
+         # Store the sql_parsing_result in run_facets for later retrieval by the DataHub listener.
+         # If the OpenLineage plugin is enabled and we got a result from the original parser,
+         # use OpenLineage's result but add DataHub's parsing to the facets.
+         if ol_result is not None:
+             logger.debug(
+                 "Using OpenLineage parser result for OperatorLineage, "
+                 "adding DataHub parsing to run_facets"
+             )
+             # Add DataHub's SQL parsing result to the existing run_facets.
+             # OperatorLineage is frozen (uses @define), so we need to create a new dict.
+             updated_run_facets = dict(ol_result.run_facets or {})
+             updated_run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = sql_parsing_result
+
+             # Create a new OperatorLineage with OpenLineage's inputs/outputs but DataHub's facet
+             operator_lineage = OperatorLineage(  # type: ignore[misc]
+                 inputs=ol_result.inputs,
+                 outputs=ol_result.outputs,
+                 job_facets=ol_result.job_facets,
+                 run_facets=updated_run_facets,
+             )
+             return operator_lineage
+
+         # OpenLineage is disabled or the original parser failed - use DataHub's parsing for everything
+         logger.debug(
+             "OpenLineage plugin disabled or parser unavailable - "
+             "using DataHub parser result for OperatorLineage"
+         )
+
+         # Convert DataHub URNs to OpenLineage Dataset objects
+         def _urn_to_ol_dataset(urn: str) -> "OpenLineageDataset":
+             """Convert a DataHub URN to the OpenLineage Dataset format."""
+             # Parse the URN to extract the database, schema, and table.
+             # URN format: urn:li:dataset:(urn:li:dataPlatform:{platform},{database}.{schema}.{table},{env})
+             try:
+                 parts = urn.split(",")
+                 if len(parts) >= 2:
+                     # Extract the table path from the URN
+                     table_path = parts[1]  # e.g., "database.schema.table"
+
+                     # Create the OL namespace and name.
+                     # For now, use the platform as the namespace and the full path as the name.
+                     namespace = f"{platform}://{default_database or 'default'}"
+                     name = table_path
+
+                     return OpenLineageDataset(namespace=namespace, name=name)
+             except Exception as e:
+                 logger.debug(f"Error converting URN {urn} to OL Dataset: {e}")
+
+             # Fallback: use the URN as the name
+             return OpenLineageDataset(namespace=f"{platform}://default", name=urn)
+
+         inputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.in_tables]
+         outputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.out_tables]
+
+         run_facets = {DATAHUB_SQL_PARSING_RESULT_KEY: sql_parsing_result}
+
+         # Create an OperatorLineage with DataHub's results
+         operator_lineage = OperatorLineage(  # type: ignore[misc]
+             inputs=inputs,
+             outputs=outputs,
+             job_facets={"sql": SqlJobFacet(query=sql)},
+             run_facets=run_facets,
+         )
+         return operator_lineage
+
+     except Exception as e:
+         logger.warning(
+             f"Error in DataHub SQL parser, falling back to default OpenLineage parser: {e}",
+             exc_info=True,
+         )
+         # Fall back to the original implementation
+         if _original_sql_parser_method is None:
+             raise RuntimeError(
+                 "Original SQLParser method not stored. patch_sqlparser() may not have been called."
+             ) from None
+         return _original_sql_parser_method(
+             self, sql, hook, database_info, database, sqlalchemy_engine, use_connection
+         )
+
+
+ class SQLParserPatch:
+     """
+     Context manager for patching Airflow's SQLParser with DataHub's SQL parser.
+
+     This class encapsulates the patching logic and manages the global state properly.
+     It can be used as a context manager for automatic cleanup, or with explicit
+     patch/unpatch methods for manual control.
+
+     Usage:
+         # As a context manager (recommended for testing)
+         with SQLParserPatch():
+             # Code runs with the patched SQLParser
+             pass
+         # Automatically unpatched on exit
+
+         # Or with explicit control
+         patcher = SQLParserPatch()
+         patcher.patch()
+         try:
+             # ... plugin lifetime ...
+         finally:
+             patcher.unpatch()
+
+     The patch stores the original SQLParser method and replaces it with DataHub's
+     enhanced implementation, which provides column-level lineage support.
+     """
+
+     def patch(self) -> "SQLParserPatch":
+         """
+         Apply the SQLParser patch.
+
+         Stores the original SQLParser.generate_openlineage_metadata_from_sql method
+         and replaces it with DataHub's enhanced implementation.
+
+         Returns:
+             self, for method chaining
+         """
+         global _original_sql_parser_method
+
+         try:
+             from airflow.providers.openlineage.sqlparser import SQLParser
+
+             # Store the original method for fallback (only if not already patched)
+             if _original_sql_parser_method is None:
+                 _original_sql_parser_method = (
+                     SQLParser.generate_openlineage_metadata_from_sql
+                 )
+
+             SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[assignment,method-assign]
+                 _datahub_generate_openlineage_metadata_from_sql  # type: ignore[assignment,method-assign]
+             )
+             logger.debug(
+                 "Patched SQLParser.generate_openlineage_metadata_from_sql with DataHub SQL parser"
+             )
+
+         except ImportError:
+             # SQLParser not available (Airflow < 3.0 or the openlineage provider is not installed)
+             logger.debug(
+                 "SQLParser not available, skipping patch (likely Airflow < 3.0)"
+             )
+
+         return self
+
+     def unpatch(self) -> "SQLParserPatch":
+         """
+         Remove the SQLParser patch and restore the original method.
+
+         This is primarily useful in testing, to ensure a clean state between tests.
+         In production, the patch typically stays active for the process lifetime.
+
+         Returns:
+             self, for method chaining
+         """
+         global _original_sql_parser_method
+
+         if _original_sql_parser_method is None:
+             logger.debug("SQLParser not patched, nothing to unpatch")
+             return self
+
+         try:
+             from airflow.providers.openlineage.sqlparser import SQLParser
+
+             # Restore the original method
+             SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[method-assign]
+                 _original_sql_parser_method
+             )
+             logger.debug("Unpatched SQLParser, restored original method")
+
+         except ImportError:
+             logger.debug("SQLParser not available, nothing to unpatch")
+         finally:
+             # Clear the stored reference to allow re-patching
+             _original_sql_parser_method = None
+
+         return self
+
+     def __enter__(self) -> "SQLParserPatch":
+         """Context manager entry: apply the patch."""
+         return self.patch()
+
+     def __exit__(
+         self,
+         exc_type: Optional[type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType],
+     ) -> None:
+         """Context manager exit: remove the patch."""
+         self.unpatch()
+
+
+ # Global patcher instance for backward compatibility
+ _global_patcher = SQLParserPatch()
+
+
+ def patch_sqlparser() -> None:
+     """
+     Patch SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
+
+     This is a convenience function that wraps SQLParserPatch.patch() for backward
+     compatibility with existing code.
+
+     It should be called early in plugin initialization, before any SQL operators are used.
+
+     When both the DataHub and OpenLineage plugins are enabled (disable_openlineage_plugin=False),
+     the patch calls BOTH parsers:
+     - OpenLineage's original parser provides inputs/outputs for the OpenLineage plugin
+     - DataHub's enhanced parser (with column-level lineage) is stored in run_facets
+       for the DataHub listener to extract
+
+     When only DataHub is enabled (disable_openlineage_plugin=True), only DataHub's
+     parser runs and provides both the OperatorLineage structure and the enhanced parsing.
+     """
+     _global_patcher.patch()
+
+
+ def unpatch_sqlparser() -> None:
+     """
+     Remove the SQLParser patch and restore the original method.
+
+     This is a convenience function that wraps SQLParserPatch.unpatch() for consistency.
+
+     It is primarily useful in testing, to ensure a clean state between tests.
+     In production, the patch typically stays active for the process lifetime.
+     """
+     _global_patcher.unpatch()
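
Note: a minimal usage sketch of the entry points defined above, grounded in the module's own docstrings (the import path follows this file's location in the wheel):

# Process-lifetime patching, as done during plugin initialization:
from datahub_airflow_plugin.airflow3._airflow3_sql_parser_patch import (
    SQLParserPatch,
    patch_sqlparser,
    unpatch_sqlparser,
)

patch_sqlparser()    # SQL operators now route through DataHub's parser
# ... tasks execute ...
unpatch_sqlparser()  # restore the original method (mainly useful in tests)

# Scoped patching via the context manager (recommended for testing):
with SQLParserPatch():
    pass  # SQLParser is patched only inside this block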
datahub_airflow_plugin/airflow3/_airflow_compat.py
@@ -0,0 +1,108 @@
+ # Airflow 3.x compatibility module.
+ # This module must be imported before any Airflow imports in any of our files.
+
+ import logging
+ from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
+
+ logger = logging.getLogger(__name__)
+
+ # Critical safety check: ensure the MarkupSafe compatibility patch is applied.
+ # This must happen before importing Airflow to prevent MarkupSafe version conflicts.
+ # Using an explicit exception instead of assert ensures this runs even with python -O.
+ if not MARKUPSAFE_PATCHED:
+     raise RuntimeError(
+         "MarkupSafe compatibility patch must be applied before importing Airflow modules. "
+         "This is a critical safety check that cannot be disabled. "
+         "The patch ensures compatibility between different MarkupSafe versions used by "
+         "Airflow and DataHub dependencies."
+     )
+
+ # Apply Airflow 3.x patches.
+ # These imports must come after the MARKUPSAFE_PATCHED check because they import Airflow modules.
+ # We need to ensure markupsafe is patched first to maintain compatibility.
+
+ # Load configuration to determine which patches to apply
+ try:
+     from datahub_airflow_plugin._config import get_lineage_config
+
+     config = get_lineage_config()
+     enable_extractors = config.enable_extractors
+     patch_sql_parser = config.patch_sql_parser
+     extract_athena_operator = config.extract_athena_operator
+     extract_bigquery_insert_job_operator = config.extract_bigquery_insert_job_operator
+     extract_teradata_operator = config.extract_teradata_operator
+ except Exception:
+     # If config loading fails, apply all patches by default (backward compatibility)
+     enable_extractors = True
+     patch_sql_parser = True
+     extract_athena_operator = True
+     extract_bigquery_insert_job_operator = True
+     extract_teradata_operator = True
+
+ # Only apply patches if extractors are enabled
+ if enable_extractors:
+     # Airflow 3.0+ SQLParser patch
+     if patch_sql_parser:
+         try:
+             from datahub_airflow_plugin.airflow3._airflow3_sql_parser_patch import (
+                 patch_sqlparser,
+             )
+
+             patch_sqlparser()
+             # Log success for debugging
+             logger.debug("✓ Successfully applied Airflow 3 SQL parser patch")
+         except ImportError as e:
+             # Not available when the openlineage packages aren't installed
+             logger.warning(
+                 f"SQL parser patch not applied - OpenLineage packages not available: {e}"
+             )
+         except Exception as e:
+             # Log any other errors
+             logger.warning(f"Failed to apply SQL parser patch: {e}", exc_info=True)
+
+     # Operator-specific patches (conditional on config and operator availability).
+     # The SQLite patch is always applied when available (no config flag yet).
+     try:
+         from datahub_airflow_plugin.airflow3._sqlite_openlineage_patch import (
+             patch_sqlite_hook,
+         )
+
+         patch_sqlite_hook()
+     except ImportError:
+         pass
+
+     if extract_athena_operator:
+         try:
+             from datahub_airflow_plugin.airflow3._athena_openlineage_patch import (
+                 patch_athena_operator,
+             )
+
+             patch_athena_operator()
+         except ImportError:
+             pass
+
+     if extract_bigquery_insert_job_operator:
+         try:
+             from datahub_airflow_plugin.airflow3._bigquery_openlineage_patch import (
+                 patch_bigquery_insert_job_operator,
+             )
+
+             patch_bigquery_insert_job_operator()
+         except ImportError:
+             pass
+
+     if extract_teradata_operator:
+         try:
+             from datahub_airflow_plugin.airflow3._teradata_openlineage_patch import (
+                 patch_teradata_operator,
+             )
+
+             patch_teradata_operator()
+         except ImportError:
+             pass
+
+ AIRFLOW_PATCHED = True
+
+ __all__ = [
+     "AIRFLOW_PATCHED",
+ ]
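
Note: the compatibility shim above gates each patch on five flags read from get_lineage_config(). A small sketch for inspecting those flags at runtime; the field names are exactly the ones the shim reads, but treating them as public API is an assumption — consult _config.py in this release for the authoritative definitions.

# Sketch: print the flags that gate the patches applied by _airflow_compat.py.
# Assumes get_lineage_config() exposes these fields, as the shim above does.
from datahub_airflow_plugin._config import get_lineage_config

config = get_lineage_config()
for flag in (
    "enable_extractors",
    "patch_sql_parser",
    "extract_athena_operator",
    "extract_bigquery_insert_job_operator",
    "extract_teradata_operator",
):
    print(flag, getattr(config, flag, None))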
datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py
@@ -0,0 +1,153 @@
+ """
+ Patch for AthenaOperator to use DataHub's SQL parser.
+
+ AthenaOperator in Airflow 3.x uses SQLParser with dialect="generic", which doesn't provide
+ column-level lineage. This patch modifies get_openlineage_facets_on_complete() to use
+ DataHub's SQL parser instead, enabling column-level lineage extraction.
+ """
+
+ import logging
+ from typing import TYPE_CHECKING, Any, Optional
+
+ import datahub.emitter.mce_builder as builder
+ from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
+ from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
+
+ if TYPE_CHECKING:
+     from airflow.models.taskinstance import TaskInstance
+     from airflow.providers.openlineage.extractors import OperatorLineage
+
+ logger = logging.getLogger(__name__)
+
+
+ def patch_athena_operator() -> None:
+     """
+     Patch AthenaOperator to use DataHub's SQL parser for lineage extraction.
+
+     This enhances the existing OpenLineage support with DataHub's SQL parser,
+     which provides better column-level lineage.
+     """
+     try:
+         from airflow.providers.amazon.aws.operators.athena import AthenaOperator
+
+         # Check that the method exists (it is only defined in Airflow 3.x)
+         if not hasattr(AthenaOperator, "get_openlineage_facets_on_complete"):
+             logger.debug(
+                 "AthenaOperator.get_openlineage_facets_on_complete not found - "
+                 "likely Airflow 2.x, skipping patch"
+             )
+             return
+
+         # Check if already patched
+         if hasattr(AthenaOperator, "_datahub_openlineage_patched"):
+             logger.debug("AthenaOperator already patched for OpenLineage")
+             return
+
+         # Store the original method
+         original_get_openlineage_facets_on_complete = (
+             AthenaOperator.get_openlineage_facets_on_complete
+         )
+
+         def get_openlineage_facets_on_complete(
+             self: Any, task_instance: "TaskInstance"
+         ) -> Optional["OperatorLineage"]:
+             """
+             Enhanced version that uses DataHub's SQL parser for better lineage.
+
+             This method:
+             1. Calls the original OpenLineage implementation
+             2. Enhances it with DataHub's SQL parsing result for column lineage
+             """
+             try:
+                 logger.debug(
+                     f"DataHub patched Athena get_openlineage_facets_on_complete called for query: {self.query[:100]}"
+                 )
+
+                 # Get the original OpenLineage result
+                 operator_lineage = original_get_openlineage_facets_on_complete(
+                     self, task_instance
+                 )
+
+                 if not operator_lineage:
+                     logger.debug(
+                         "Original OpenLineage returned None for Athena operator"
+                     )
+                     return operator_lineage
+
+                 logger.debug(
+                     f"Original Athena OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
+                 )
+
+                 # Check whether the SQL parsing result is already in run_facets (from the SQLParser patch).
+                 # If not, add it manually, since Athena might not use SQLParser or the patch might not apply.
+                 if DATAHUB_SQL_PARSING_RESULT_KEY not in operator_lineage.run_facets:
+                     # The SQLParser patch didn't add it - add it manually
+                     try:
+                         platform = "athena"
+                         default_database = (
+                             self.database if hasattr(self, "database") else None
+                         )
+
+                         # Get the SQL query - templates are already rendered by Airflow during task execution
+                         rendered_query = self.query
+
+                         logger.debug(
+                             f"Running DataHub SQL parser for Athena (platform={platform}, "
+                             f"default_db={default_database}): {rendered_query[:200] if rendered_query else 'None'}"
+                         )
+
+                         # Use DataHub's SQL parser
+                         sql_parsing_result = create_lineage_sql_parsed_result(
+                             query=rendered_query,
+                             platform=platform,
+                             platform_instance=None,
+                             env=builder.DEFAULT_ENV,
+                             default_db=default_database,
+                             default_schema=None,
+                         )
+
+                         # Store the SQL parsing result in run_facets for the DataHub listener
+                         if sql_parsing_result:
+                             operator_lineage.run_facets[
+                                 DATAHUB_SQL_PARSING_RESULT_KEY
+                             ] = sql_parsing_result
+                             logger.debug(
+                                 f"Added DataHub SQL parsing result with "
+                                 f"{len(sql_parsing_result.column_lineage or [])} column lineages"
+                             )
+                     except Exception as e:
+                         logger.warning(
+                             f"Error running DataHub SQL parser for Athena: {e}",
+                             exc_info=True,
+                         )
+                 else:
+                     logger.debug(
+                         f"DataHub SQL parsing result already present in run_facets "
+                         f"(added by SQLParser patch) with "
+                         f"{len(operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY].column_lineage or [])} column lineages"
+                     )
+
+                 return operator_lineage
+
+             except Exception as e:
+                 logger.warning(
+                     f"Error in patched AthenaOperator.get_openlineage_facets_on_complete: {e}",
+                     exc_info=True,
+                 )
+                 # Fall back to the original method
+                 return original_get_openlineage_facets_on_complete(self, task_instance)
+
+         # Apply the patch (mypy dislikes dynamic method assignment, but it's necessary for patching)
+         AthenaOperator.get_openlineage_facets_on_complete = (  # type: ignore[assignment,method-assign]
+             get_openlineage_facets_on_complete  # type: ignore[assignment]
+         )
+         AthenaOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]
+
+         logger.debug(
+             "Patched AthenaOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
+         )
+
+     except ImportError as e:
+         logger.debug(
+             f"Could not patch AthenaOperator for OpenLineage (provider not installed or Airflow < 3.0): {e}"
+         )
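
Note: both patches in this release hand results to the DataHub listener through the same run facet. A hedged sketch of the consuming side, using only attributes the patches above already rely on (in_tables, out_tables, column_lineage); summarize_datahub_facet is a hypothetical helper, not part of the plugin.

# Sketch: read back the facet that the patched operators attach.
from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY


def summarize_datahub_facet(operator_lineage) -> str:
    # operator_lineage is an OperatorLineage returned by a patched operator.
    result = operator_lineage.run_facets.get(DATAHUB_SQL_PARSING_RESULT_KEY)
    if result is None:
        return "no DataHub parsing result attached"
    return (
        f"{len(result.in_tables)} inputs, {len(result.out_tables)} outputs, "
        f"{len(result.column_lineage or [])} column lineages"
    )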