acryl-datahub-airflow-plugin 1.3.1.3rc2__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,308 @@
1
+ """
2
+ Patch for TeradataOperator to use DataHub's SQL parser.
3
+
4
+ TeradataOperator in Airflow 2.x provider mode uses DefaultExtractor which returns empty
5
+ OperatorLineage. This patch modifies get_openlineage_facets_on_complete() to use
6
+ DataHub's SQL parser, enabling lineage extraction.
7
+ """
8
+
9
+ import logging
10
+ from typing import TYPE_CHECKING, Any, Optional
11
+
12
+ import datahub.emitter.mce_builder as builder
13
+ from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
14
+ from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
15
+
16
+ if TYPE_CHECKING:
17
+ from airflow.models.taskinstance import TaskInstance
18
+ from airflow.providers.openlineage.extractors import OperatorLineage
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ def _should_patch_teradata_operator(operator_class: Any) -> bool:
24
+ """Check if Teradata operator should be patched."""
25
+ if not hasattr(operator_class, "get_openlineage_facets_on_complete"):
26
+ openlineage_methods = [
27
+ m
28
+ for m in dir(operator_class)
29
+ if "openlineage" in m.lower() or "facet" in m.lower()
30
+ ]
31
+ logger.warning(
32
+ f"TeradataOperator.get_openlineage_facets_on_complete not found - "
33
+ f"skipping patch. Available OpenLineage-related methods: {openlineage_methods}"
34
+ )
35
+ return False
36
+ if hasattr(operator_class, "_datahub_openlineage_patched"):
37
+ logger.debug("TeradataOperator already patched for OpenLineage")
38
+ return False
39
+ return True
40
+
41
+
42
+ def _render_teradata_sql_templates(
43
+ sql: str, operator: Any, task_instance: "TaskInstance"
44
+ ) -> str:
45
+ """
46
+ Render Jinja templates in Teradata SQL if they exist.
47
+
48
+ Returns the rendered SQL, or original SQL if rendering fails.
49
+ """
50
+ if "{{" not in str(sql) and "{%" not in str(sql):
51
+ return sql
52
+
53
+ try:
54
+ # Get template context from task_instance
55
+ context: Any = {}
56
+ if hasattr(task_instance, "get_template_context"):
57
+ context = task_instance.get_template_context()
58
+ elif (
59
+ hasattr(task_instance, "task")
60
+ and task_instance.task is not None
61
+ and hasattr(task_instance.task, "get_template_context")
62
+ ):
63
+ context = task_instance.task.get_template_context()
64
+
65
+ # Try to render using the operator's render_template method
66
+ if hasattr(operator, "render_template") and context:
67
+ rendered_query = operator.render_template(sql, context)
68
+ else:
69
+ # Fallback: try to render using Jinja2 directly
70
+ from airflow.templates import SandboxedEnvironment
71
+
72
+ jinja_env = SandboxedEnvironment()
73
+ template = jinja_env.from_string(str(sql))
74
+ rendered_query = template.render(**context) # type: ignore[misc]
75
+
76
+ logger.debug(f"Rendered Teradata SQL templates: {rendered_query[:200]}")
77
+ return rendered_query
78
+ except Exception as e:
79
+ logger.warning(f"Failed to render Teradata SQL templates, using original: {e}")
80
+ return sql
81
+
82
+
83
def _enhance_teradata_lineage_with_sql_parsing(
    operator_lineage: "OperatorLineage",
    rendered_sql: str,
    operator: Any,
) -> "OperatorLineage":
    """
    Enhance OperatorLineage with DataHub SQL parsing results.

    Modifies operator_lineage in place by adding the SQL parsing result to
    run_facets, then returns it. If the SQLParser patch already stored a
    result under DATAHUB_SQL_PARSING_RESULT_KEY, the existing entry is kept.
    Parsing failures are logged and swallowed so lineage emission never
    breaks the task.
    """
    # Check if SQL parsing result is already in run_facets (from SQLParser patch)
    if DATAHUB_SQL_PARSING_RESULT_KEY not in operator_lineage.run_facets:
        # SQLParser patch didn't add it - add it manually
        try:
            platform = "teradata"
            # Teradata uses database.table naming, no separate schema.
            # getattr with a default replaces the hasattr-then-access pattern
            # (single attribute lookup instead of two).
            default_database = getattr(operator, "schema", None)

            logger.debug(
                f"Running DataHub SQL parser for Teradata (platform={platform}, "
                f"default_db={default_database}): {rendered_sql[:200] if rendered_sql else 'None'}"
            )

            # Use DataHub's SQL parser
            sql_parsing_result = create_lineage_sql_parsed_result(
                query=rendered_sql,
                platform=platform,
                platform_instance=None,
                env=builder.DEFAULT_ENV,
                default_db=default_database,
                default_schema=None,
            )

            # Store the SQL parsing result in run_facets for DataHub listener
            if sql_parsing_result:
                operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = (
                    sql_parsing_result
                )
                logger.debug(
                    f"Added DataHub SQL parsing result for Teradata with "
                    f"{len(sql_parsing_result.in_tables)} input tables, "
                    f"{len(sql_parsing_result.out_tables)} output tables, "
                    f"{len(sql_parsing_result.column_lineage or [])} column lineages"
                )
        except Exception as e:
            logger.warning(
                f"Error running DataHub SQL parser for Teradata: {e}",
                exc_info=True,
            )
    else:
        logger.debug(
            f"DataHub SQL parsing result already present in run_facets "
            f"(added by SQLParser patch) with "
            f"{len(operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY].column_lineage or [])} column lineages"
        )

    return operator_lineage
141
+
142
+
143
def _create_teradata_openlineage_wrapper(
    original_get_openlineage_facets_on_complete: Any,
) -> Any:
    """Create wrapper function for Teradata operator's OpenLineage method.

    Args:
        original_get_openlineage_facets_on_complete: The unpatched
            ``get_openlineage_facets_on_complete`` method taken from
            ``TeradataOperator``.

    Returns:
        A replacement method that augments the original's result with a
        DataHub SQL-parsing run facet, or the original method unchanged
        when ``OperatorLineage`` cannot be imported (the caller uses this
        identity check to detect import failure).
    """
    # Import OperatorLineage at wrapper creation time to check availability
    # This avoids runtime import errors that would cause the patch to return None
    # Try multiple import paths for compatibility with different Airflow versions
    # Airflow 3.x: from airflow.providers.openlineage.extractors
    # Airflow 2.x provider: from airflow.providers.openlineage.extractors.base
    OperatorLineageClass: Any = None
    import_error = None
    try:
        # Try Airflow 3.x import path first
        from airflow.providers.openlineage.extractors import (
            OperatorLineage as OperatorLineageClass,
        )
    except (ImportError, ModuleNotFoundError) as e:
        import_error = e
        try:
            # Fallback for Airflow 2.x provider mode compatibility
            from airflow.providers.openlineage.extractors.base import (
                OperatorLineage as OperatorLineageClass,
            )

            import_error = None  # Success, clear the error
        except (ImportError, ModuleNotFoundError) as e2:
            # Both imports failed - log the more specific error
            # (heuristic: if the first error already mentions "Operator" it is
            # about the class itself, so prefer it over the fallback's error)
            import_error = e2 if "Operator" not in str(e) else e

    if OperatorLineageClass is None or import_error is not None:
        # Log warning but don't fail - this is expected in some environments
        error_msg = str(import_error) if import_error else "Unknown import error"
        logger.warning(
            f"Could not import OperatorLineage for Teradata patch: {error_msg}. "
            "This may be due to OpenLineage provider compatibility issues. "
            "Patch will not be applied."
        )
        # Return original function if import fails
        return original_get_openlineage_facets_on_complete

    def get_openlineage_facets_on_complete(
        self: Any, task_instance: "TaskInstance"
    ) -> Optional["OperatorLineage"]:
        """
        Enhanced version that uses DataHub's SQL parser for better lineage.

        This method:
        1. Calls the original OpenLineage implementation
        2. Enhances it with DataHub SQL parsing result for column lineage

        Any unexpected error falls back to the original implementation so the
        patch can never make lineage worse than the stock behavior.
        """
        try:
            # Get the SQL query from operator
            sql = self.sql
            if not sql:
                logger.debug("No SQL query found in TeradataOperator")
                return original_get_openlineage_facets_on_complete(self, task_instance)

            # Handle list of SQL statements (TeradataOperator supports both str and list)
            if isinstance(sql, list):
                # Join multiple statements with semicolon
                sql = ";\n".join(str(s) for s in sql)
            else:
                sql = str(sql)

            logger.debug(
                f"DataHub patched Teradata get_openlineage_facets_on_complete called for query: {sql[:100]}"
            )

            # Get the original OpenLineage result
            operator_lineage = original_get_openlineage_facets_on_complete(
                self, task_instance
            )

            # If original returns None (DefaultExtractor returns None),
            # create a new OperatorLineage so we can still add SQL parsing result
            if not operator_lineage:
                logger.debug(
                    "Original OpenLineage returned None for TeradataOperator, "
                    "creating new OperatorLineage for SQL parsing"
                )
                # OperatorLineageClass is already imported at wrapper creation time
                operator_lineage = OperatorLineageClass(  # type: ignore[misc]
                    inputs=[],
                    outputs=[],
                    job_facets={},
                    run_facets={},
                )

            logger.debug(
                f"Original Teradata OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
            )

            # Render SQL templates if needed
            rendered_sql = _render_teradata_sql_templates(sql, self, task_instance)

            # Enhance with SQL parsing
            operator_lineage = _enhance_teradata_lineage_with_sql_parsing(
                operator_lineage, rendered_sql, self
            )

            return operator_lineage

        except Exception as e:
            logger.warning(
                f"Error in patched TeradataOperator.get_openlineage_facets_on_complete: {e}",
                exc_info=True,
            )
            # Fall back to original method
            return original_get_openlineage_facets_on_complete(self, task_instance)

    return get_openlineage_facets_on_complete
254
+
255
+
256
def patch_teradata_operator() -> None:
    """
    Patch TeradataOperator to use DataHub's SQL parser for lineage extraction.

    Wraps ``get_openlineage_facets_on_complete`` so the emitted lineage also
    carries DataHub's SQL parsing result, enabling column-level lineage for
    Teradata operators. A missing teradata provider is logged and ignored.
    """
    try:
        logger.debug("Attempting to patch TeradataOperator for OpenLineage")
        from airflow.providers.teradata.operators.teradata import TeradataOperator

        logger.debug(f"Successfully imported TeradataOperator: {TeradataOperator}")

        if not _should_patch_teradata_operator(TeradataOperator):
            logger.warning(
                "TeradataOperator patch check failed - patch will not be applied"
            )
            return
        logger.debug("TeradataOperator patch check passed - proceeding with patch")

        # Keep a reference to the unpatched method so the wrapper can delegate.
        original_method = TeradataOperator.get_openlineage_facets_on_complete
        wrapped_method = _create_teradata_openlineage_wrapper(original_method)

        # The factory hands back the original untouched when OperatorLineage
        # could not be imported; in that case there is nothing to install.
        if wrapped_method is original_method:
            logger.debug(
                "TeradataOperator patch not applied - OperatorLineage import failed. "
                "Falling back to original OpenLineage behavior."
            )
            return

        # Monkey patch: dynamic method assignment is intentional here.
        TeradataOperator.get_openlineage_facets_on_complete = wrapped_method  # type: ignore[assignment,method-assign]
        TeradataOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]

        logger.debug(
            "Successfully patched TeradataOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
        )

    except ImportError as e:
        logger.debug(
            f"Could not patch TeradataOperator for OpenLineage (provider not installed): {e}"
        )