acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
  3. datahub_airflow_plugin/_airflow_shims.py +31 -64
  4. datahub_airflow_plugin/_config.py +19 -97
  5. datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
  6. datahub_airflow_plugin/_extractors.py +365 -0
  7. datahub_airflow_plugin/_version.py +1 -1
  8. datahub_airflow_plugin/client/airflow_generator.py +43 -147
  9. datahub_airflow_plugin/datahub_listener.py +790 -19
  10. datahub_airflow_plugin/example_dags/__init__.py +0 -32
  11. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
  12. datahub_airflow_plugin/hooks/datahub.py +2 -11
  13. datahub_airflow_plugin/operators/datahub.py +3 -20
  14. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
  15. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
  16. datahub_airflow_plugin/_airflow_compat.py +0 -32
  17. datahub_airflow_plugin/_airflow_version_specific.py +0 -184
  18. datahub_airflow_plugin/_constants.py +0 -16
  19. datahub_airflow_plugin/airflow2/__init__.py +0 -6
  20. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
  21. datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
  22. datahub_airflow_plugin/airflow2/_extractors.py +0 -477
  23. datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
  24. datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
  25. datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
  26. datahub_airflow_plugin/airflow2/_shims.py +0 -88
  27. datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
  28. datahub_airflow_plugin/airflow3/__init__.py +0 -6
  29. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
  30. datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
  31. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
  32. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
  33. datahub_airflow_plugin/airflow3/_shims.py +0 -82
  34. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
  35. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
  36. datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
  37. datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
  38. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
  39. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
  40. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
  41. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
  42. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
  43. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
  44. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
  45. datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
  46. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
  47. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
  48. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
  49. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/_airflow_version_specific.py
@@ -1,184 +0,0 @@
-"""
-Version-specific utilities for Airflow 2 vs 3 compatibility.
-This module provides clean abstractions for version-specific behavior.
-"""
-
-import logging
-from typing import TYPE_CHECKING, Dict
-
-import airflow
-import packaging.version
-
-if TYPE_CHECKING:
-    from airflow.models import TaskInstance
-
-logger = logging.getLogger(__name__)
-
-# Version detection
-AIRFLOW_VERSION = packaging.version.parse(airflow.__version__)
-IS_AIRFLOW_3_OR_HIGHER = AIRFLOW_VERSION >= packaging.version.parse("3.0.0")
-
-
-def _get_duration_attribute(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract duration attribute, calculating it if necessary.
-
-    Airflow 2.x has duration as a direct attribute.
-    Airflow 3.x requires calculation from end_date - start_date.
-    """
-    if hasattr(ti, "duration"):
-        return {"duration": str(ti.duration)}
-
-    if (
-        hasattr(ti, "end_date")
-        and ti.end_date
-        and hasattr(ti, "start_date")
-        and ti.start_date
-    ):
-        try:
-            duration_seconds = (ti.end_date - ti.start_date).total_seconds()
-            return {"duration": str(duration_seconds)}
-        except Exception as e:
-            logger.debug(f"Could not calculate duration: {e}")
-
-    return {}
-
-
-def _get_operator_attribute(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract operator name in a version-compatible way.
-
-    In Airflow 2.x: Available as database column attribute ti.operator
-    In Airflow 3.x (RuntimeTaskInstance): Must extract from ti.task.__class__.__name__
-    """
-    if hasattr(ti, "operator"):
-        operator_from_db = str(ti.operator)
-        logger.debug(
-            f"Operator from ti.operator (DB): {operator_from_db}, "
-            f"hasattr task: {hasattr(ti, 'task')}, "
-            f"task class: {ti.task.__class__.__name__ if hasattr(ti, 'task') and ti.task else 'N/A'}"
-        )
-        return {"operator": operator_from_db}
-
-    if hasattr(ti, "task") and ti.task is not None:
-        try:
-            return {"operator": ti.task.__class__.__name__}
-        except Exception as e:
-            logger.debug(f"Could not get operator name from task: {e}")
-
-    return {}
-
-
-def _get_date_attributes(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract date-related attributes.
-
-    Handles execution_date -> logical_date rename in Airflow 3.0.
-    """
-    attributes = {}
-
-    if hasattr(ti, "end_date"):
-        attributes["end_date"] = str(ti.end_date)
-
-    if hasattr(ti, "execution_date"):
-        attributes["execution_date"] = str(ti.execution_date)
-    elif hasattr(ti, "logical_date"):
-        attributes["logical_date"] = str(ti.logical_date)
-
-    return attributes
-
-
-def get_task_instance_attributes(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract attributes from a TaskInstance in a version-compatible way.
-
-    Airflow 3.0 introduced RuntimeTaskInstance which has different attributes
-    than Airflow 2.x TaskInstance.
-
-    Returns a dict of attribute name -> string value.
-    """
-    attributes = {}
-
-    # Common attributes (both Airflow 2 and 3)
-    if hasattr(ti, "run_id"):
-        attributes["run_id"] = str(ti.run_id)
-    if hasattr(ti, "start_date") and ti.start_date:
-        attributes["start_date"] = str(ti.start_date)
-    if hasattr(ti, "try_number"):
-        attributes["try_number"] = str(ti.try_number - 1)
-    if hasattr(ti, "state"):
-        attributes["state"] = str(ti.state)
-    if hasattr(ti, "task_id"):
-        attributes["task_id"] = str(ti.task_id)
-    if hasattr(ti, "dag_id"):
-        attributes["dag_id"] = str(ti.dag_id)
-
-    # Complex extractions via helper functions
-    attributes.update(_get_duration_attribute(ti))
-    attributes.update(_get_date_attributes(ti))
-    attributes.update(_get_operator_attribute(ti))
-
-    # Optional attributes
-    if hasattr(ti, "max_tries"):
-        attributes["max_tries"] = str(ti.max_tries)
-    if hasattr(ti, "external_executor_id"):
-        attributes["external_executor_id"] = str(ti.external_executor_id)
-    if hasattr(ti, "priority_weight"):
-        attributes["priority_weight"] = str(ti.priority_weight)
-    if hasattr(ti, "log_url"):
-        attributes["log_url"] = ti.log_url
-
-    return attributes
-
-
-def get_airflow_compatible_dag_kwargs(**kwargs):  # type: ignore[no-untyped-def]
-    """
-    Get DAG kwargs that are compatible with current Airflow version.
-
-    Handles differences between Airflow 2.x and 3.x:
-    - schedule_interval -> schedule in Airflow 3.0
-    - default_view removed in Airflow 3.0
-    - start_date handling
-    """
-    compatible_kwargs = kwargs.copy()
-
-    if IS_AIRFLOW_3_OR_HIGHER:
-        # Airflow 3.0 renamed schedule_interval to schedule
-        if "schedule_interval" in compatible_kwargs:
-            compatible_kwargs["schedule"] = compatible_kwargs.pop("schedule_interval")
-
-        # Airflow 3.0 removed default_view
-        if "default_view" in compatible_kwargs:
-            del compatible_kwargs["default_view"]
-
-    return compatible_kwargs  # type: ignore[no-any-return]
-
-
-def days_ago(n: int):  # type: ignore[no-untyped-def]
-    """
-    Compatibility helper for days_ago which was removed in Airflow 3.0.
-
-    In Airflow 2.x, use airflow.utils.dates.days_ago()
-    In Airflow 3.0, use datetime.datetime - datetime.timedelta
-    """
-    from datetime import datetime, timedelta, timezone
-
-    if IS_AIRFLOW_3_OR_HIGHER:
-        # Airflow 3.0: use datetime directly
-        return datetime.now(timezone.utc) - timedelta(days=n)
-    else:
-        # Airflow 2.x: use the official helper
-        from airflow.utils.dates import (  # type: ignore[attr-defined]
-            days_ago as airflow_days_ago,
-        )
-
-        return airflow_days_ago(n)  # type: ignore[no-any-return]
-
-
-__all__ = [
-    "AIRFLOW_VERSION",
-    "IS_AIRFLOW_3_OR_HIGHER",
-    "get_task_instance_attributes",
-    "get_airflow_compatible_dag_kwargs",
-    "days_ago",
-]
datahub_airflow_plugin/_constants.py
@@ -1,16 +0,0 @@
-"""
-Shared constants for the DataHub Airflow plugin.
-
-This module centralizes constant values used across multiple modules
-to avoid duplication and ensure consistency.
-"""
-
-# SQL parsing result keys for storing SQL lineage in OpenLineage facets
-
-# Key for DataHub's enhanced SQL parsing result (with column-level lineage)
-# Used in Airflow 3.x to pass results from SQLParser patch to DataHub listener
-DATAHUB_SQL_PARSING_RESULT_KEY = "datahub_sql_parsing_result"
-
-# Key for DataHub's SQL parsing result in Airflow 2.x extractors
-# Used to pass results from extractors to DataHub listener
-SQL_PARSING_RESULT_KEY = "datahub_sql"
datahub_airflow_plugin/airflow2/__init__.py
@@ -1,6 +0,0 @@
-"""
-DataHub Airflow Plugin v2 for Airflow 2.x.
-
-This module provides the DataHub listener implementation for Airflow 2.x,
-using the legacy openlineage-airflow package and extractor-based lineage.
-"""
datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py
@@ -1,402 +0,0 @@
-"""
-Patch for Airflow 2.10+ with apache-airflow-providers-openlineage SQLParser.
-
-When using apache-airflow-providers-openlineage with Airflow 2.10+, SQL operators call
-SQLParser.generate_openlineage_metadata_from_sql() directly rather than using extractors.
-This module patches that method to use DataHub's SQL parser, which provides better
-column-level lineage support.
-
-This is analogous to the Airflow 3 SQL parser patch, but adapted for Airflow 2.10+
-when the provider package is installed.
-"""
-
-import logging
-from types import TracebackType
-from typing import TYPE_CHECKING, Any, Callable, Optional
-
-# Try importing from provider package (Airflow 2.10+ with apache-airflow-providers-openlineage)
-try:
-    from airflow.providers.openlineage.extractors import OperatorLineage
-    from airflow.providers.openlineage.sqlparser import DatabaseInfo
-    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
-    from openlineage.client.facet import SqlJobFacet
-
-    PROVIDER_IMPORTS_AVAILABLE = True
-except ImportError:
-    # Provider package not available
-    OperatorLineage = None  # type: ignore[assignment,misc]
-    DatabaseInfo = None  # type: ignore[assignment,misc]
-    OpenLineageDataset = None  # type: ignore[assignment,misc]
-    SqlJobFacet = None  # type: ignore[assignment,misc]
-    PROVIDER_IMPORTS_AVAILABLE = False
-
-# DataHub imports (always available)
-import datahub.emitter.mce_builder as builder
-from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
-from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
-
-if TYPE_CHECKING:
-    from airflow.providers.openlineage.extractors import OperatorLineage
-    from airflow.providers.openlineage.sqlparser import DatabaseInfo
-    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
-    from openlineage.client.facet import SqlJobFacet
-
-logger = logging.getLogger(__name__)
-
-# Store the original SQLParser method for fallback
-_original_sql_parser_method: Optional[Callable[..., Any]] = None
-
-
-def _datahub_generate_openlineage_metadata_from_sql(
-    self: Any,
-    sql: Any,
-    hook: Any,
-    database_info: dict,
-    database: Optional[str] = None,
-    sqlalchemy_engine: Optional[Any] = None,
-    use_connection: bool = True,
-) -> Optional["OperatorLineage"]:
-    """
-    Override SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
-
-    This is necessary because in Airflow 2.10+ with provider package, SQL operators call
-    SQLParser directly rather than using extractors. We intercept this call and use
-    DataHub's SQL parser to generate lineage with column-level lineage support.
-
-    When OpenLineage plugin is enabled (disable_openlineage_plugin=False), we call both
-    parsers: OpenLineage gets its own parsing results, while DataHub's enhanced parsing
-    is stored in a custom facet for the DataHub listener to extract.
-    """
-    try:
-        # Import here to avoid circular dependency
-        from datahub_airflow_plugin._config import get_lineage_config
-        from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener
-
-        # Check if OpenLineage plugin is enabled
-        try:
-            config = get_lineage_config()
-            openlineage_enabled = not config.disable_openlineage_plugin
-        except Exception as e:
-            logger.warning(
-                f"Could not load config to check disable_openlineage_plugin: {e}"
-            )
-            openlineage_enabled = False
-
-        # If OpenLineage is enabled, call the original parser first to get its results
-        ol_result = None
-        if openlineage_enabled and _original_sql_parser_method is not None:
-            try:
-                logger.debug(
-                    "OpenLineage plugin enabled - calling original parser for OpenLineage"
-                )
-                ol_result = _original_sql_parser_method(
-                    self,
-                    sql,
-                    hook,
-                    database_info,
-                    database,
-                    sqlalchemy_engine,
-                    use_connection,
-                )
-                logger.debug(f"OpenLineage parser result: {ol_result}")
-            except Exception as e:
-                logger.warning(
-                    f"Error calling original OpenLineage parser, will use only DataHub parser: {e}",
-                    exc_info=True,
-                )
-
-        # Handle missing database_info by creating a minimal one from connection
-        if database_info is None:
-            # Get basic properties from hook's connection
-            conn = getattr(hook, "get_connection", lambda: None)()
-            scheme = getattr(conn, "conn_type", None) if conn else None
-            db_name = getattr(conn, "schema", None) if conn else None
-
-            database_info = DatabaseInfo(
-                scheme=scheme,
-                authority=None,
-                database=db_name,
-                information_schema_columns=[],
-                information_schema_table_name="",
-                use_flat_cross_db_query=False,
-                is_information_schema_cross_db=False,
-                is_uppercase_names=False,
-                normalize_name_method=lambda x: x.lower(),
-            )
-            logger.debug(
-                f"Created minimal DatabaseInfo from connection: scheme={scheme}, database={db_name}"
-            )
-
-        # Get platform from dialect or from database_info scheme
-        # If dialect is "generic", prefer database_info.scheme (connection type)
-        platform = self.dialect or "sql"
-        if platform == "generic" and database_info:
-            # Use the actual connection type instead of "generic"
-            platform = getattr(database_info, "scheme", platform) or platform
-            if platform == "generic":
-                raise ValueError(
-                    "Could not determine platform from generic dialect or database_info"
-                )
-
-        platform = OL_SCHEME_TWEAKS.get(platform, platform)
-
-        # Get default database and schema
-        default_database = database or getattr(database_info, "database", None)
-        default_schema = self.default_schema
-
-        # Handle list of SQL statements
-        if isinstance(sql, list):
-            logger.debug("Got list of SQL statements. Using first one for parsing.")
-            sql = sql[0] if sql else ""
-
-        # Run DataHub's SQL parser
-        listener = get_airflow_plugin_listener()
-        graph = listener.graph if listener else None
-
-        logger.debug(
-            "Running DataHub SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-            "with graph client" if graph else "in offline mode",
-            platform,
-            default_database,
-            default_schema,
-            sql,
-        )
-
-        sql_parsing_result = create_lineage_sql_parsed_result(
-            query=sql,
-            graph=graph,
-            platform=platform,
-            platform_instance=None,
-            env=builder.DEFAULT_ENV,
-            default_db=default_database,
-            default_schema=default_schema,
-        )
-
-        logger.debug(f"DataHub SQL parser result: {sql_parsing_result}")
-
-        # Store the sql_parsing_result in run_facets for later retrieval by the DataHub listener
-        # If OpenLineage plugin is enabled and we got a result from the original parser,
-        # use OpenLineage's result but add DataHub's parsing to the facets
-        if ol_result is not None:
-            logger.debug(
-                "Using OpenLineage parser result for OperatorLineage, "
-                "adding DataHub parsing to run_facets"
-            )
-            # Add DataHub's SQL parsing result to the existing run_facets
-            # OperatorLineage is frozen (uses @define), so we need to create a new dict
-            updated_run_facets = dict(ol_result.run_facets or {})
-            updated_run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = sql_parsing_result
-
-            # Create new OperatorLineage with OpenLineage's inputs/outputs but DataHub's facet
-            operator_lineage = OperatorLineage(  # type: ignore[misc]
-                inputs=ol_result.inputs,
-                outputs=ol_result.outputs,
-                job_facets=ol_result.job_facets,
-                run_facets=updated_run_facets,
-            )
-            return operator_lineage
-
-        # OpenLineage is disabled or original parser failed - use DataHub's parsing for everything
-        logger.debug(
-            "OpenLineage plugin disabled or parser unavailable - "
-            "using DataHub parser result for OperatorLineage"
-        )
-
-        # Convert DataHub URNs to OpenLineage Dataset objects
-        def _urn_to_ol_dataset(urn: str) -> "OpenLineageDataset":
-            """Convert DataHub URN to OpenLineage Dataset format."""
-            # Parse URN to extract database, schema, table
-            # URN format: urn:li:dataset:(urn:li:dataPlatform:{platform},{database}.{schema}.{table},{env})
-            try:
-                parts = urn.split(",")
-                if len(parts) >= 2:
-                    # Extract table path from URN
-                    table_path = parts[1]  # e.g., "database.schema.table"
-
-                    # Create OL namespace and name
-                    # For now, use platform as namespace and full path as name
-                    namespace = f"{platform}://{default_database or 'default'}"
-                    name = table_path
-
-                    return OpenLineageDataset(namespace=namespace, name=name)
-            except Exception as e:
-                logger.debug(f"Error converting URN {urn} to OL Dataset: {e}")
-
-            # Fallback: use URN as name
-            return OpenLineageDataset(namespace=f"{platform}://default", name=urn)
-
-        inputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.in_tables]
-        outputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.out_tables]
-
-        run_facets = {DATAHUB_SQL_PARSING_RESULT_KEY: sql_parsing_result}
-
-        # Create OperatorLineage with DataHub's results
-        operator_lineage = OperatorLineage(  # type: ignore[misc]
-            inputs=inputs,
-            outputs=outputs,
-            job_facets={"sql": SqlJobFacet(query=sql)},
-            run_facets=run_facets,
-        )
-        return operator_lineage
-
-    except Exception as e:
-        logger.warning(
-            f"Error in DataHub SQL parser, falling back to default OpenLineage parser: {e}",
-            exc_info=True,
-        )
-        # Fall back to original implementation
-        if _original_sql_parser_method is None:
-            raise RuntimeError(
-                "Original SQLParser method not stored. patch_sqlparser() may not have been called."
-            ) from None
-        return _original_sql_parser_method(
-            self, sql, hook, database_info, database, sqlalchemy_engine, use_connection
-        )
-
-
-class SQLParserPatch:
-    """
-    Context manager for patching Airflow's SQLParser with DataHub's SQL parser.
-
-    This class encapsulates the patching logic and manages the global state properly.
-    It can be used as a context manager for automatic cleanup, or with explicit
-    patch/unpatch methods for manual control.
-
-    Usage:
-        # As a context manager (recommended for testing)
-        with SQLParserPatch():
-            # Code runs with patched SQLParser
-            pass
-        # Automatically unpatched on exit
-
-        # Or with explicit control
-        patcher = SQLParserPatch()
-        patcher.patch()
-        try:
-            # ... plugin lifetime ...
-        finally:
-            patcher.unpatch()
-
-    The patch stores the original SQLParser method and replaces it with DataHub's
-    enhanced implementation that provides column-level lineage support.
-    """
-
-    def patch(self) -> "SQLParserPatch":
-        """
-        Apply the SQLParser patch.
-
-        Stores the original SQLParser.generate_openlineage_metadata_from_sql method
-        and replaces it with DataHub's enhanced implementation.
-
-        Returns:
-            self for method chaining
-        """
-        global _original_sql_parser_method
-
-        try:
-            from airflow.providers.openlineage.sqlparser import SQLParser
-
-            # Store original method for fallback (only if not already patched)
-            if _original_sql_parser_method is None:
-                _original_sql_parser_method = (
-                    SQLParser.generate_openlineage_metadata_from_sql
-                )
-
-            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[assignment,method-assign]
-                _datahub_generate_openlineage_metadata_from_sql  # type: ignore[assignment,method-assign]
-            )
-            logger.debug(
-                "Patched SQLParser.generate_openlineage_metadata_from_sql with DataHub SQL parser"
-            )
-
-        except ImportError:
-            # SQLParser not available (provider package not installed or Airflow < 2.10)
-            logger.debug(
-                "SQLParser not available, skipping patch (likely Airflow < 2.10 or provider package not installed)"
-            )
-
-        return self
-
-    def unpatch(self) -> "SQLParserPatch":
-        """
-        Remove the SQLParser patch and restore the original method.
-
-        This is primarily useful for testing to ensure clean state between tests.
-        In production, the patch typically stays active for the process lifetime.
-
-        Returns:
-            self for method chaining
-        """
-        global _original_sql_parser_method
-
-        if _original_sql_parser_method is None:
-            logger.debug("SQLParser not patched, nothing to unpatch")
-            return self
-
-        try:
-            from airflow.providers.openlineage.sqlparser import SQLParser
-
-            # Restore original method
-            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[method-assign]
-                _original_sql_parser_method
-            )
-            logger.debug("Unpatched SQLParser, restored original method")
-
-        except ImportError:
-            logger.debug("SQLParser not available, nothing to unpatch")
-        finally:
-            # Clear the stored reference to allow re-patching
-            _original_sql_parser_method = None
-
-        return self
-
-    def __enter__(self) -> "SQLParserPatch":
-        """Context manager entry: apply the patch."""
-        return self.patch()
-
-    def __exit__(
-        self,
-        exc_type: Optional[type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        """Context manager exit: remove the patch."""
-        self.unpatch()
-
-
-# Global patcher instance for backward compatibility
-_global_patcher = SQLParserPatch()
-
-
-def patch_sqlparser() -> None:
-    """
-    Patch SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
-
-    This is a convenience function that wraps SQLParserPatch.patch() for backward
-    compatibility with existing code.
-
-    This should be called early in the plugin initialization, before any SQL operators are used.
-
-    When both DataHub and OpenLineage plugins are enabled (disable_openlineage_plugin=False),
-    the patch calls BOTH parsers:
-    - OpenLineage's original parser provides inputs/outputs for OpenLineage plugin
-    - DataHub's enhanced parser (with column-level lineage) is stored in run_facets
-      for DataHub listener to extract
-
-    When only DataHub is enabled (disable_openlineage_plugin=True), only DataHub's
-    parser runs and provides both the OperatorLineage structure and the enhanced parsing.
-    """
-    _global_patcher.patch()
-
-
-def unpatch_sqlparser() -> None:
-    """
-    Remove the SQLParser patch and restore the original method.
-
-    This is a convenience function that wraps SQLParserPatch.unpatch() for consistency.
-
-    This is primarily useful for testing to ensure clean state between tests.
-    In production, the patch typically stays active for the process lifetime.
-    """
-    _global_patcher.unpatch()