acryl-datahub-airflow-plugin 1.3.1.3rc2__py3-none-any.whl → 1.3.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py
@@ -0,0 +1,308 @@
+"""
+Patch for TeradataOperator to use DataHub's SQL parser.
+
+TeradataOperator in Airflow 2.x provider mode uses DefaultExtractor which returns empty
+OperatorLineage. This patch modifies get_openlineage_facets_on_complete() to use
+DataHub's SQL parser, enabling lineage extraction.
+"""
+
+import logging
+from typing import TYPE_CHECKING, Any, Optional
+
+import datahub.emitter.mce_builder as builder
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
+from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
+
+if TYPE_CHECKING:
+    from airflow.models.taskinstance import TaskInstance
+    from airflow.providers.openlineage.extractors import OperatorLineage
+
+logger = logging.getLogger(__name__)
+
+
+def _should_patch_teradata_operator(operator_class: Any) -> bool:
+    """Check if Teradata operator should be patched."""
+    if not hasattr(operator_class, "get_openlineage_facets_on_complete"):
+        openlineage_methods = [
+            m
+            for m in dir(operator_class)
+            if "openlineage" in m.lower() or "facet" in m.lower()
+        ]
+        logger.warning(
+            f"TeradataOperator.get_openlineage_facets_on_complete not found - "
+            f"skipping patch. Available OpenLineage-related methods: {openlineage_methods}"
+        )
+        return False
+    if hasattr(operator_class, "_datahub_openlineage_patched"):
+        logger.debug("TeradataOperator already patched for OpenLineage")
+        return False
+    return True
+
+
+def _render_teradata_sql_templates(
+    sql: str, operator: Any, task_instance: "TaskInstance"
+) -> str:
+    """
+    Render Jinja templates in Teradata SQL if they exist.
+
+    Returns the rendered SQL, or original SQL if rendering fails.
+    """
+    if "{{" not in str(sql) and "{%" not in str(sql):
+        return sql
+
+    try:
+        # Get template context from task_instance
+        context: Any = {}
+        if hasattr(task_instance, "get_template_context"):
+            context = task_instance.get_template_context()
+        elif (
+            hasattr(task_instance, "task")
+            and task_instance.task is not None
+            and hasattr(task_instance.task, "get_template_context")
+        ):
+            context = task_instance.task.get_template_context()
+
+        # Try to render using the operator's render_template method
+        if hasattr(operator, "render_template") and context:
+            rendered_query = operator.render_template(sql, context)
+        else:
+            # Fallback: try to render using Jinja2 directly
+            from airflow.templates import SandboxedEnvironment
+
+            jinja_env = SandboxedEnvironment()
+            template = jinja_env.from_string(str(sql))
+            rendered_query = template.render(**context)  # type: ignore[misc]
+
+        logger.debug(f"Rendered Teradata SQL templates: {rendered_query[:200]}")
+        return rendered_query
+    except Exception as e:
+        logger.warning(f"Failed to render Teradata SQL templates, using original: {e}")
+        return sql
+
+
+def _enhance_teradata_lineage_with_sql_parsing(
+    operator_lineage: "OperatorLineage",
+    rendered_sql: str,
+    operator: Any,
+) -> "OperatorLineage":
+    """
+    Enhance OperatorLineage with DataHub SQL parsing results.
+
+    Modifies operator_lineage in place by adding SQL parsing result to run_facets.
+    """
+    # Check if SQL parsing result is already in run_facets (from SQLParser patch)
+    if DATAHUB_SQL_PARSING_RESULT_KEY not in operator_lineage.run_facets:
+        # SQLParser patch didn't add it - add it manually
+        try:
+            platform = "teradata"
+            # Teradata uses database.table naming, no separate schema
+            # Get database/schema from operator if available
+            default_database = operator.schema if hasattr(operator, "schema") else None
+
+            logger.debug(
+                f"Running DataHub SQL parser for Teradata (platform={platform}, "
+                f"default_db={default_database}): {rendered_sql[:200] if rendered_sql else 'None'}"
+            )
+
+            # Use DataHub's SQL parser
+            sql_parsing_result = create_lineage_sql_parsed_result(
+                query=rendered_sql,
+                platform=platform,
+                platform_instance=None,
+                env=builder.DEFAULT_ENV,
+                default_db=default_database,
+                default_schema=None,
+            )
+
+            # Store the SQL parsing result in run_facets for DataHub listener
+            if sql_parsing_result:
+                operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = (
+                    sql_parsing_result
+                )
+                logger.debug(
+                    f"Added DataHub SQL parsing result for Teradata with "
+                    f"{len(sql_parsing_result.in_tables)} input tables, "
+                    f"{len(sql_parsing_result.out_tables)} output tables, "
+                    f"{len(sql_parsing_result.column_lineage or [])} column lineages"
+                )
+        except Exception as e:
+            logger.warning(
+                f"Error running DataHub SQL parser for Teradata: {e}",
+                exc_info=True,
+            )
+    else:
+        logger.debug(
+            f"DataHub SQL parsing result already present in run_facets "
+            f"(added by SQLParser patch) with "
+            f"{len(operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY].column_lineage or [])} column lineages"
+        )
+
+    return operator_lineage
+
+
+def _create_teradata_openlineage_wrapper(
+    original_get_openlineage_facets_on_complete: Any,
+) -> Any:
+    """Create wrapper function for Teradata operator's OpenLineage method."""
+    # Import OperatorLineage at wrapper creation time to check availability
+    # This avoids runtime import errors that would cause the patch to return None
+    # Try multiple import paths for compatibility with different Airflow versions
+    # Airflow 3.x: from airflow.providers.openlineage.extractors
+    # Airflow 2.x provider: from airflow.providers.openlineage.extractors.base
+    OperatorLineageClass: Any = None
+    import_error = None
+    try:
+        # Try Airflow 3.x import path first
+        from airflow.providers.openlineage.extractors import (
+            OperatorLineage as OperatorLineageClass,
+        )
+    except (ImportError, ModuleNotFoundError) as e:
+        import_error = e
+        try:
+            # Fallback for Airflow 2.x provider mode compatibility
+            from airflow.providers.openlineage.extractors.base import (
+                OperatorLineage as OperatorLineageClass,
+            )
+
+            import_error = None  # Success, clear the error
+        except (ImportError, ModuleNotFoundError) as e2:
+            # Both imports failed - log the more specific error
+            import_error = e2 if "Operator" not in str(e) else e
+
+    if OperatorLineageClass is None or import_error is not None:
+        # Log warning but don't fail - this is expected in some environments
+        error_msg = str(import_error) if import_error else "Unknown import error"
+        logger.warning(
+            f"Could not import OperatorLineage for Teradata patch: {error_msg}. "
+            "This may be due to OpenLineage provider compatibility issues. "
+            "Patch will not be applied."
+        )
+        # Return original function if import fails
+        return original_get_openlineage_facets_on_complete
+
+    def get_openlineage_facets_on_complete(
+        self: Any, task_instance: "TaskInstance"
+    ) -> Optional["OperatorLineage"]:
+        """
+        Enhanced version that uses DataHub's SQL parser for better lineage.
+
+        This method:
+        1. Calls the original OpenLineage implementation
+        2. Enhances it with DataHub SQL parsing result for column lineage
+        """
+        try:
+            # Get the SQL query from operator
+            sql = self.sql
+            if not sql:
+                logger.debug("No SQL query found in TeradataOperator")
+                return original_get_openlineage_facets_on_complete(self, task_instance)
+
+            # Handle list of SQL statements (TeradataOperator supports both str and list)
+            if isinstance(sql, list):
+                # Join multiple statements with semicolon
+                sql = ";\n".join(str(s) for s in sql)
+            else:
+                sql = str(sql)
+
+            logger.debug(
+                f"DataHub patched Teradata get_openlineage_facets_on_complete called for query: {sql[:100]}"
+            )
+
+            # Get the original OpenLineage result
+            operator_lineage = original_get_openlineage_facets_on_complete(
+                self, task_instance
+            )
+
+            # If original returns None (DefaultExtractor returns None),
+            # create a new OperatorLineage so we can still add SQL parsing result
+            if not operator_lineage:
+                logger.debug(
+                    "Original OpenLineage returned None for TeradataOperator, "
+                    "creating new OperatorLineage for SQL parsing"
+                )
+                # OperatorLineageClass is already imported at wrapper creation time
+                operator_lineage = OperatorLineageClass(  # type: ignore[misc]
+                    inputs=[],
+                    outputs=[],
+                    job_facets={},
+                    run_facets={},
+                )
+
+            logger.debug(
+                f"Original Teradata OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
+            )
+
+            # Render SQL templates if needed
+            rendered_sql = _render_teradata_sql_templates(sql, self, task_instance)
+
+            # Enhance with SQL parsing
+            operator_lineage = _enhance_teradata_lineage_with_sql_parsing(
+                operator_lineage, rendered_sql, self
+            )
+
+            return operator_lineage
+
+        except Exception as e:
+            logger.warning(
+                f"Error in patched TeradataOperator.get_openlineage_facets_on_complete: {e}",
+                exc_info=True,
+            )
+            # Fall back to original method
+            return original_get_openlineage_facets_on_complete(self, task_instance)
+
+    return get_openlineage_facets_on_complete
+
+
+def patch_teradata_operator() -> None:
+    """
+    Patch TeradataOperator to use DataHub's SQL parser for lineage extraction.
+
+    This enhances the existing OpenLineage support with DataHub's SQL parser,
+    which provides column-level lineage extraction for Teradata operators.
+    """
+    try:
+        logger.debug("Attempting to patch TeradataOperator for OpenLineage")
+        from airflow.providers.teradata.operators.teradata import TeradataOperator
+
+        logger.debug(f"Successfully imported TeradataOperator: {TeradataOperator}")
+
+        if not _should_patch_teradata_operator(TeradataOperator):
+            logger.warning(
+                "TeradataOperator patch check failed - patch will not be applied"
+            )
+            return
+        logger.debug("TeradataOperator patch check passed - proceeding with patch")
+
+        # Store original method
+        original_get_openlineage_facets_on_complete = (
+            TeradataOperator.get_openlineage_facets_on_complete
+        )
+
+        # Create wrapper function
+        wrapper = _create_teradata_openlineage_wrapper(
+            original_get_openlineage_facets_on_complete
+        )
+
+        # Check if wrapper creation failed (import error)
+        # If wrapper is the same as original, the import failed and we shouldn't apply the patch
+        if wrapper is original_get_openlineage_facets_on_complete:
+            logger.debug(
+                "TeradataOperator patch not applied - OperatorLineage import failed. "
+                "Falling back to original OpenLineage behavior."
+            )
+            return
+
+        # Apply the patch (mypy doesn't like dynamic method assignment, but it's necessary for patching)
+        TeradataOperator.get_openlineage_facets_on_complete = (  # type: ignore[assignment,method-assign]
+            wrapper  # type: ignore[assignment]
+        )
+        TeradataOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]
+
+        logger.debug(
+            "Successfully patched TeradataOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
+        )
+
+    except ImportError as e:
+        logger.debug(
+            f"Could not patch TeradataOperator for OpenLineage (provider not installed): {e}"
+        )
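
For context, a minimal sketch of how this patch could be activated, assuming the module lives at the path shown in the file list above and that some setup hook (for example the DataHub listener) calls it once before Teradata tasks run. This snippet is illustrative only and is not part of the packaged diff:

    # Illustrative activation of the Teradata OpenLineage patch (not from the diff).
    from datahub_airflow_plugin.airflow3._teradata_openlineage_patch import (
        patch_teradata_operator,
    )

    # Safe to call unconditionally: a repeat call is skipped via the
    # _datahub_openlineage_patched flag, and the call degrades to a logged
    # no-op if the Teradata provider or OperatorLineage cannot be imported.
    patch_teradata_operator()

After this call, TeradataOperator.get_openlineage_facets_on_complete returns an OperatorLineage whose run_facets carry the DataHub SQL parsing result under DATAHUB_SQL_PARSING_RESULT_KEY, which the DataHub listener can translate into table- and column-level lineage.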