acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
- datahub_airflow_plugin/_airflow_shims.py +31 -64
- datahub_airflow_plugin/_config.py +19 -97
- datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
- datahub_airflow_plugin/_extractors.py +365 -0
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/client/airflow_generator.py +43 -147
- datahub_airflow_plugin/datahub_listener.py +790 -19
- datahub_airflow_plugin/example_dags/__init__.py +0 -32
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
- datahub_airflow_plugin/hooks/datahub.py +2 -11
- datahub_airflow_plugin/operators/datahub.py +3 -20
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
- datahub_airflow_plugin/_airflow_compat.py +0 -32
- datahub_airflow_plugin/_airflow_version_specific.py +0 -184
- datahub_airflow_plugin/_constants.py +0 -16
- datahub_airflow_plugin/airflow2/__init__.py +0 -6
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
- datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
- datahub_airflow_plugin/airflow2/_extractors.py +0 -477
- datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
- datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
- datahub_airflow_plugin/airflow2/_shims.py +0 -88
- datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
- datahub_airflow_plugin/airflow3/__init__.py +0 -6
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
- datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
- datahub_airflow_plugin/airflow3/_shims.py +0 -82
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
- datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0

datahub_airflow_plugin/_airflow_version_specific.py
@@ -1,184 +0,0 @@
-"""
-Version-specific utilities for Airflow 2 vs 3 compatibility.
-This module provides clean abstractions for version-specific behavior.
-"""
-
-import logging
-from typing import TYPE_CHECKING, Dict
-
-import airflow
-import packaging.version
-
-if TYPE_CHECKING:
-    from airflow.models import TaskInstance
-
-logger = logging.getLogger(__name__)
-
-# Version detection
-AIRFLOW_VERSION = packaging.version.parse(airflow.__version__)
-IS_AIRFLOW_3_OR_HIGHER = AIRFLOW_VERSION >= packaging.version.parse("3.0.0")
-
-
-def _get_duration_attribute(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract duration attribute, calculating it if necessary.
-
-    Airflow 2.x has duration as a direct attribute.
-    Airflow 3.x requires calculation from end_date - start_date.
-    """
-    if hasattr(ti, "duration"):
-        return {"duration": str(ti.duration)}
-
-    if (
-        hasattr(ti, "end_date")
-        and ti.end_date
-        and hasattr(ti, "start_date")
-        and ti.start_date
-    ):
-        try:
-            duration_seconds = (ti.end_date - ti.start_date).total_seconds()
-            return {"duration": str(duration_seconds)}
-        except Exception as e:
-            logger.debug(f"Could not calculate duration: {e}")
-
-    return {}
-
-
-def _get_operator_attribute(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract operator name in a version-compatible way.
-
-    In Airflow 2.x: Available as database column attribute ti.operator
-    In Airflow 3.x (RuntimeTaskInstance): Must extract from ti.task.__class__.__name__
-    """
-    if hasattr(ti, "operator"):
-        operator_from_db = str(ti.operator)
-        logger.debug(
-            f"Operator from ti.operator (DB): {operator_from_db}, "
-            f"hasattr task: {hasattr(ti, 'task')}, "
-            f"task class: {ti.task.__class__.__name__ if hasattr(ti, 'task') and ti.task else 'N/A'}"
-        )
-        return {"operator": operator_from_db}
-
-    if hasattr(ti, "task") and ti.task is not None:
-        try:
-            return {"operator": ti.task.__class__.__name__}
-        except Exception as e:
-            logger.debug(f"Could not get operator name from task: {e}")
-
-    return {}
-
-
-def _get_date_attributes(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract date-related attributes.
-
-    Handles execution_date -> logical_date rename in Airflow 3.0.
-    """
-    attributes = {}
-
-    if hasattr(ti, "end_date"):
-        attributes["end_date"] = str(ti.end_date)
-
-    if hasattr(ti, "execution_date"):
-        attributes["execution_date"] = str(ti.execution_date)
-    elif hasattr(ti, "logical_date"):
-        attributes["logical_date"] = str(ti.logical_date)
-
-    return attributes
-
-
-def get_task_instance_attributes(ti: "TaskInstance") -> Dict[str, str]:
-    """
-    Extract attributes from a TaskInstance in a version-compatible way.
-
-    Airflow 3.0 introduced RuntimeTaskInstance which has different attributes
-    than Airflow 2.x TaskInstance.
-
-    Returns a dict of attribute name -> string value.
-    """
-    attributes = {}
-
-    # Common attributes (both Airflow 2 and 3)
-    if hasattr(ti, "run_id"):
-        attributes["run_id"] = str(ti.run_id)
-    if hasattr(ti, "start_date") and ti.start_date:
-        attributes["start_date"] = str(ti.start_date)
-    if hasattr(ti, "try_number"):
-        attributes["try_number"] = str(ti.try_number - 1)
-    if hasattr(ti, "state"):
-        attributes["state"] = str(ti.state)
-    if hasattr(ti, "task_id"):
-        attributes["task_id"] = str(ti.task_id)
-    if hasattr(ti, "dag_id"):
-        attributes["dag_id"] = str(ti.dag_id)
-
-    # Complex extractions via helper functions
-    attributes.update(_get_duration_attribute(ti))
-    attributes.update(_get_date_attributes(ti))
-    attributes.update(_get_operator_attribute(ti))
-
-    # Optional attributes
-    if hasattr(ti, "max_tries"):
-        attributes["max_tries"] = str(ti.max_tries)
-    if hasattr(ti, "external_executor_id"):
-        attributes["external_executor_id"] = str(ti.external_executor_id)
-    if hasattr(ti, "priority_weight"):
-        attributes["priority_weight"] = str(ti.priority_weight)
-    if hasattr(ti, "log_url"):
-        attributes["log_url"] = ti.log_url
-
-    return attributes
-
-
-def get_airflow_compatible_dag_kwargs(**kwargs):  # type: ignore[no-untyped-def]
-    """
-    Get DAG kwargs that are compatible with current Airflow version.
-
-    Handles differences between Airflow 2.x and 3.x:
-    - schedule_interval -> schedule in Airflow 3.0
-    - default_view removed in Airflow 3.0
-    - start_date handling
-    """
-    compatible_kwargs = kwargs.copy()
-
-    if IS_AIRFLOW_3_OR_HIGHER:
-        # Airflow 3.0 renamed schedule_interval to schedule
-        if "schedule_interval" in compatible_kwargs:
-            compatible_kwargs["schedule"] = compatible_kwargs.pop("schedule_interval")
-
-        # Airflow 3.0 removed default_view
-        if "default_view" in compatible_kwargs:
-            del compatible_kwargs["default_view"]
-
-    return compatible_kwargs  # type: ignore[no-any-return]
-
-
-def days_ago(n: int):  # type: ignore[no-untyped-def]
-    """
-    Compatibility helper for days_ago which was removed in Airflow 3.0.
-
-    In Airflow 2.x, use airflow.utils.dates.days_ago()
-    In Airflow 3.0, use datetime.datetime - datetime.timedelta
-    """
-    from datetime import datetime, timedelta, timezone
-
-    if IS_AIRFLOW_3_OR_HIGHER:
-        # Airflow 3.0: use datetime directly
-        return datetime.now(timezone.utc) - timedelta(days=n)
-    else:
-        # Airflow 2.x: use the official helper
-        from airflow.utils.dates import (  # type: ignore[attr-defined]
-            days_ago as airflow_days_ago,
-        )
-
-        return airflow_days_ago(n)  # type: ignore[no-any-return]
-
-
-__all__ = [
-    "AIRFLOW_VERSION",
-    "IS_AIRFLOW_3_OR_HIGHER",
-    "get_task_instance_attributes",
-    "get_airflow_compatible_dag_kwargs",
-    "days_ago",
-]

datahub_airflow_plugin/_constants.py
@@ -1,16 +0,0 @@
-"""
-Shared constants for the DataHub Airflow plugin.
-
-This module centralizes constant values used across multiple modules
-to avoid duplication and ensure consistency.
-"""
-
-# SQL parsing result keys for storing SQL lineage in OpenLineage facets
-
-# Key for DataHub's enhanced SQL parsing result (with column-level lineage)
-# Used in Airflow 3.x to pass results from SQLParser patch to DataHub listener
-DATAHUB_SQL_PARSING_RESULT_KEY = "datahub_sql_parsing_result"
-
-# Key for DataHub's SQL parsing result in Airflow 2.x extractors
-# Used to pass results from extractors to DataHub listener
-SQL_PARSING_RESULT_KEY = "datahub_sql"

datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py
@@ -1,402 +0,0 @@
-"""
-Patch for Airflow 2.10+ with apache-airflow-providers-openlineage SQLParser.
-
-When using apache-airflow-providers-openlineage with Airflow 2.10+, SQL operators call
-SQLParser.generate_openlineage_metadata_from_sql() directly rather than using extractors.
-This module patches that method to use DataHub's SQL parser, which provides better
-column-level lineage support.
-
-This is analogous to the Airflow 3 SQL parser patch, but adapted for Airflow 2.10+
-when the provider package is installed.
-"""
-
-import logging
-from types import TracebackType
-from typing import TYPE_CHECKING, Any, Callable, Optional
-
-# Try importing from provider package (Airflow 2.10+ with apache-airflow-providers-openlineage)
-try:
-    from airflow.providers.openlineage.extractors import OperatorLineage
-    from airflow.providers.openlineage.sqlparser import DatabaseInfo
-    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
-    from openlineage.client.facet import SqlJobFacet
-
-    PROVIDER_IMPORTS_AVAILABLE = True
-except ImportError:
-    # Provider package not available
-    OperatorLineage = None  # type: ignore[assignment,misc]
-    DatabaseInfo = None  # type: ignore[assignment,misc]
-    OpenLineageDataset = None  # type: ignore[assignment,misc]
-    SqlJobFacet = None  # type: ignore[assignment,misc]
-    PROVIDER_IMPORTS_AVAILABLE = False
-
-# DataHub imports (always available)
-import datahub.emitter.mce_builder as builder
-from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
-from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
-
-if TYPE_CHECKING:
-    from airflow.providers.openlineage.extractors import OperatorLineage
-    from airflow.providers.openlineage.sqlparser import DatabaseInfo
-    from openlineage.client.event_v2 import Dataset as OpenLineageDataset
-    from openlineage.client.facet import SqlJobFacet
-
-logger = logging.getLogger(__name__)
-
-# Store the original SQLParser method for fallback
-_original_sql_parser_method: Optional[Callable[..., Any]] = None
-
-
-def _datahub_generate_openlineage_metadata_from_sql(
-    self: Any,
-    sql: Any,
-    hook: Any,
-    database_info: dict,
-    database: Optional[str] = None,
-    sqlalchemy_engine: Optional[Any] = None,
-    use_connection: bool = True,
-) -> Optional["OperatorLineage"]:
-    """
-    Override SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
-
-    This is necessary because in Airflow 2.10+ with provider package, SQL operators call
-    SQLParser directly rather than using extractors. We intercept this call and use
-    DataHub's SQL parser to generate lineage with column-level lineage support.
-
-    When OpenLineage plugin is enabled (disable_openlineage_plugin=False), we call both
-    parsers: OpenLineage gets its own parsing results, while DataHub's enhanced parsing
-    is stored in a custom facet for the DataHub listener to extract.
-    """
-    try:
-        # Import here to avoid circular dependency
-        from datahub_airflow_plugin._config import get_lineage_config
-        from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener
-
-        # Check if OpenLineage plugin is enabled
-        try:
-            config = get_lineage_config()
-            openlineage_enabled = not config.disable_openlineage_plugin
-        except Exception as e:
-            logger.warning(
-                f"Could not load config to check disable_openlineage_plugin: {e}"
-            )
-            openlineage_enabled = False
-
-        # If OpenLineage is enabled, call the original parser first to get its results
-        ol_result = None
-        if openlineage_enabled and _original_sql_parser_method is not None:
-            try:
-                logger.debug(
-                    "OpenLineage plugin enabled - calling original parser for OpenLineage"
-                )
-                ol_result = _original_sql_parser_method(
-                    self,
-                    sql,
-                    hook,
-                    database_info,
-                    database,
-                    sqlalchemy_engine,
-                    use_connection,
-                )
-                logger.debug(f"OpenLineage parser result: {ol_result}")
-            except Exception as e:
-                logger.warning(
-                    f"Error calling original OpenLineage parser, will use only DataHub parser: {e}",
-                    exc_info=True,
-                )
-
-        # Handle missing database_info by creating a minimal one from connection
-        if database_info is None:
-            # Get basic properties from hook's connection
-            conn = getattr(hook, "get_connection", lambda: None)()
-            scheme = getattr(conn, "conn_type", None) if conn else None
-            db_name = getattr(conn, "schema", None) if conn else None
-
-            database_info = DatabaseInfo(
-                scheme=scheme,
-                authority=None,
-                database=db_name,
-                information_schema_columns=[],
-                information_schema_table_name="",
-                use_flat_cross_db_query=False,
-                is_information_schema_cross_db=False,
-                is_uppercase_names=False,
-                normalize_name_method=lambda x: x.lower(),
-            )
-            logger.debug(
-                f"Created minimal DatabaseInfo from connection: scheme={scheme}, database={db_name}"
-            )
-
-        # Get platform from dialect or from database_info scheme
-        # If dialect is "generic", prefer database_info.scheme (connection type)
-        platform = self.dialect or "sql"
-        if platform == "generic" and database_info:
-            # Use the actual connection type instead of "generic"
-            platform = getattr(database_info, "scheme", platform) or platform
-            if platform == "generic":
-                raise ValueError(
-                    "Could not determine platform from generic dialect or database_info"
-                )
-
-        platform = OL_SCHEME_TWEAKS.get(platform, platform)
-
-        # Get default database and schema
-        default_database = database or getattr(database_info, "database", None)
-        default_schema = self.default_schema
-
-        # Handle list of SQL statements
-        if isinstance(sql, list):
-            logger.debug("Got list of SQL statements. Using first one for parsing.")
-            sql = sql[0] if sql else ""
-
-        # Run DataHub's SQL parser
-        listener = get_airflow_plugin_listener()
-        graph = listener.graph if listener else None
-
-        logger.debug(
-            "Running DataHub SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-            "with graph client" if graph else "in offline mode",
-            platform,
-            default_database,
-            default_schema,
-            sql,
-        )
-
-        sql_parsing_result = create_lineage_sql_parsed_result(
-            query=sql,
-            graph=graph,
-            platform=platform,
-            platform_instance=None,
-            env=builder.DEFAULT_ENV,
-            default_db=default_database,
-            default_schema=default_schema,
-        )
-
-        logger.debug(f"DataHub SQL parser result: {sql_parsing_result}")
-
-        # Store the sql_parsing_result in run_facets for later retrieval by the DataHub listener
-        # If OpenLineage plugin is enabled and we got a result from the original parser,
-        # use OpenLineage's result but add DataHub's parsing to the facets
-        if ol_result is not None:
-            logger.debug(
-                "Using OpenLineage parser result for OperatorLineage, "
-                "adding DataHub parsing to run_facets"
-            )
-            # Add DataHub's SQL parsing result to the existing run_facets
-            # OperatorLineage is frozen (uses @define), so we need to create a new dict
-            updated_run_facets = dict(ol_result.run_facets or {})
-            updated_run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = sql_parsing_result
-
-            # Create new OperatorLineage with OpenLineage's inputs/outputs but DataHub's facet
-            operator_lineage = OperatorLineage(  # type: ignore[misc]
-                inputs=ol_result.inputs,
-                outputs=ol_result.outputs,
-                job_facets=ol_result.job_facets,
-                run_facets=updated_run_facets,
-            )
-            return operator_lineage
-
-        # OpenLineage is disabled or original parser failed - use DataHub's parsing for everything
-        logger.debug(
-            "OpenLineage plugin disabled or parser unavailable - "
-            "using DataHub parser result for OperatorLineage"
-        )
-
-        # Convert DataHub URNs to OpenLineage Dataset objects
-        def _urn_to_ol_dataset(urn: str) -> "OpenLineageDataset":
-            """Convert DataHub URN to OpenLineage Dataset format."""
-            # Parse URN to extract database, schema, table
-            # URN format: urn:li:dataset:(urn:li:dataPlatform:{platform},{database}.{schema}.{table},{env})
-            try:
-                parts = urn.split(",")
-                if len(parts) >= 2:
-                    # Extract table path from URN
-                    table_path = parts[1]  # e.g., "database.schema.table"
-
-                    # Create OL namespace and name
-                    # For now, use platform as namespace and full path as name
-                    namespace = f"{platform}://{default_database or 'default'}"
-                    name = table_path
-
-                    return OpenLineageDataset(namespace=namespace, name=name)
-            except Exception as e:
-                logger.debug(f"Error converting URN {urn} to OL Dataset: {e}")
-
-            # Fallback: use URN as name
-            return OpenLineageDataset(namespace=f"{platform}://default", name=urn)
-
-        inputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.in_tables]
-        outputs = [_urn_to_ol_dataset(urn) for urn in sql_parsing_result.out_tables]
-
-        run_facets = {DATAHUB_SQL_PARSING_RESULT_KEY: sql_parsing_result}
-
-        # Create OperatorLineage with DataHub's results
-        operator_lineage = OperatorLineage(  # type: ignore[misc]
-            inputs=inputs,
-            outputs=outputs,
-            job_facets={"sql": SqlJobFacet(query=sql)},
-            run_facets=run_facets,
-        )
-        return operator_lineage
-
-    except Exception as e:
-        logger.warning(
-            f"Error in DataHub SQL parser, falling back to default OpenLineage parser: {e}",
-            exc_info=True,
-        )
-        # Fall back to original implementation
-        if _original_sql_parser_method is None:
-            raise RuntimeError(
-                "Original SQLParser method not stored. patch_sqlparser() may not have been called."
-            ) from None
-        return _original_sql_parser_method(
-            self, sql, hook, database_info, database, sqlalchemy_engine, use_connection
-        )
-
-
-class SQLParserPatch:
-    """
-    Context manager for patching Airflow's SQLParser with DataHub's SQL parser.
-
-    This class encapsulates the patching logic and manages the global state properly.
-    It can be used as a context manager for automatic cleanup, or with explicit
-    patch/unpatch methods for manual control.
-
-    Usage:
-        # As a context manager (recommended for testing)
-        with SQLParserPatch():
-            # Code runs with patched SQLParser
-            pass
-        # Automatically unpatched on exit
-
-        # Or with explicit control
-        patcher = SQLParserPatch()
-        patcher.patch()
-        try:
-            # ... plugin lifetime ...
-        finally:
-            patcher.unpatch()
-
-    The patch stores the original SQLParser method and replaces it with DataHub's
-    enhanced implementation that provides column-level lineage support.
-    """
-
-    def patch(self) -> "SQLParserPatch":
-        """
-        Apply the SQLParser patch.
-
-        Stores the original SQLParser.generate_openlineage_metadata_from_sql method
-        and replaces it with DataHub's enhanced implementation.
-
-        Returns:
-            self for method chaining
-        """
-        global _original_sql_parser_method
-
-        try:
-            from airflow.providers.openlineage.sqlparser import SQLParser
-
-            # Store original method for fallback (only if not already patched)
-            if _original_sql_parser_method is None:
-                _original_sql_parser_method = (
-                    SQLParser.generate_openlineage_metadata_from_sql
-                )
-
-            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[assignment,method-assign]
-                _datahub_generate_openlineage_metadata_from_sql  # type: ignore[assignment,method-assign]
-            )
-            logger.debug(
-                "Patched SQLParser.generate_openlineage_metadata_from_sql with DataHub SQL parser"
-            )
-
-        except ImportError:
-            # SQLParser not available (provider package not installed or Airflow < 2.10)
-            logger.debug(
-                "SQLParser not available, skipping patch (likely Airflow < 2.10 or provider package not installed)"
-            )
-
-        return self
-
-    def unpatch(self) -> "SQLParserPatch":
-        """
-        Remove the SQLParser patch and restore the original method.
-
-        This is primarily useful for testing to ensure clean state between tests.
-        In production, the patch typically stays active for the process lifetime.
-
-        Returns:
-            self for method chaining
-        """
-        global _original_sql_parser_method
-
-        if _original_sql_parser_method is None:
-            logger.debug("SQLParser not patched, nothing to unpatch")
-            return self
-
-        try:
-            from airflow.providers.openlineage.sqlparser import SQLParser
-
-            # Restore original method
-            SQLParser.generate_openlineage_metadata_from_sql = (  # type: ignore[method-assign]
-                _original_sql_parser_method
-            )
-            logger.debug("Unpatched SQLParser, restored original method")
-
-        except ImportError:
-            logger.debug("SQLParser not available, nothing to unpatch")
-        finally:
-            # Clear the stored reference to allow re-patching
-            _original_sql_parser_method = None
-
-        return self
-
-    def __enter__(self) -> "SQLParserPatch":
-        """Context manager entry: apply the patch."""
-        return self.patch()
-
-    def __exit__(
-        self,
-        exc_type: Optional[type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> None:
-        """Context manager exit: remove the patch."""
-        self.unpatch()
-
-
-# Global patcher instance for backward compatibility
-_global_patcher = SQLParserPatch()
-
-
-def patch_sqlparser() -> None:
-    """
-    Patch SQLParser.generate_openlineage_metadata_from_sql to use DataHub's SQL parser.
-
-    This is a convenience function that wraps SQLParserPatch.patch() for backward
-    compatibility with existing code.
-
-    This should be called early in the plugin initialization, before any SQL operators are used.
-
-    When both DataHub and OpenLineage plugins are enabled (disable_openlineage_plugin=False),
-    the patch calls BOTH parsers:
-    - OpenLineage's original parser provides inputs/outputs for OpenLineage plugin
-    - DataHub's enhanced parser (with column-level lineage) is stored in run_facets
-      for DataHub listener to extract
-
-    When only DataHub is enabled (disable_openlineage_plugin=True), only DataHub's
-    parser runs and provides both the OperatorLineage structure and the enhanced parsing.
-    """
-    _global_patcher.patch()
-
-
-def unpatch_sqlparser() -> None:
-    """
-    Remove the SQLParser patch and restore the original method.
-
-    This is a convenience function that wraps SQLParserPatch.unpatch() for consistency.
-
-    This is primarily useful for testing to ensure clean state between tests.
-    In production, the patch typically stays active for the process lifetime.
-    """
-    _global_patcher.unpatch()