acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
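
The headline change in this release is the split into parallel airflow2/ and airflow3/ subpackages, each with its own datahub_listener.py, while the top-level datahub_listener.py drops 790 lines and becomes a thin dispatcher. As a rough, hypothetical sketch of the kind of version-gated selection this layout enables (the real logic lives in datahub_airflow_plugin/datahub_listener.py and _airflow_version_specific.py, which this excerpt does not show):

import airflow
from packaging.version import Version

# Hypothetical dispatch sketch; the plugin's actual selection logic is in
# datahub_listener.py / _airflow_version_specific.py and may differ.
if Version(airflow.__version__).major >= 3:
    from datahub_airflow_plugin.airflow3 import datahub_listener as listener_impl
else:
    from datahub_airflow_plugin.airflow2 import datahub_listener as listener_impl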
datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py
@@ -0,0 +1,273 @@
+ """
+ Patch for BigQueryInsertJobOperator to use DataHub's SQL parser.
+
+ BigQueryInsertJobOperator in Airflow 3.x doesn't use the standard SQLParser approach
+ because it stores SQL in a configuration dictionary. This patch modifies
+ get_openlineage_facets_on_complete() to use DataHub's SQL parser, enabling
+ column-level lineage extraction.
+ """
+
+ import logging
+ from typing import TYPE_CHECKING, Any, Optional
+
+ import datahub.emitter.mce_builder as builder
+
+ if TYPE_CHECKING:
+     from airflow.models.taskinstance import TaskInstance
+     from airflow.providers.openlineage.extractors import OperatorLineage
+
+ logger = logging.getLogger(__name__)
+
+
+ def _should_patch_bigquery_operator(operator_class: Any) -> bool:
+     """Check if BigQuery operator should be patched."""
+     if not hasattr(operator_class, "get_openlineage_facets_on_complete"):
+         logger.debug(
+             "BigQueryInsertJobOperator.get_openlineage_facets_on_complete not found - "
+             "likely Airflow 2.x, skipping patch"
+         )
+         return False
+     if hasattr(operator_class, "_datahub_openlineage_patched"):
+         logger.debug("BigQueryInsertJobOperator already patched for OpenLineage")
+         return False
+     return True
+
+
+ def _render_bigquery_sql_templates(
+     sql: str, operator: Any, task_instance: "TaskInstance"
+ ) -> str:
+     """
+     Render Jinja templates in BigQuery SQL if they exist.
+
+     Returns the rendered SQL, or original SQL if rendering fails.
+     """
+     if "{{" not in str(sql) and "{%" not in str(sql):
+         return sql
+
+     try:
+         # Get template context from task_instance
+         context: Any = {}
+         if hasattr(task_instance, "get_template_context"):
+             context = task_instance.get_template_context()
+         elif (
+             hasattr(task_instance, "task")
+             and task_instance.task is not None
+             and hasattr(task_instance.task, "get_template_context")
+         ):
+             context = task_instance.task.get_template_context()
+
+         # Try to render using the operator's render_template method
+         if hasattr(operator, "render_template") and context:
+             rendered_sql = operator.render_template(sql, context)
+         else:
+             # Fallback: try to render using Jinja2 directly
+             from airflow.templates import SandboxedEnvironment
+
+             jinja_env = SandboxedEnvironment()
+             template = jinja_env.from_string(str(sql))
+             rendered_sql = template.render(**context)  # type: ignore[misc]
+
+         logger.debug(
+             "Rendered BigQuery SQL templates: %s -> %s",
+             str(sql)[:100],
+             str(rendered_sql)[:100],
+         )
+         return rendered_sql
+     except Exception as e:
+         logger.warning(
+             "Failed to render BigQuery SQL templates, using original SQL: %s",
+             e,
+         )
+         return sql
+
+
+ def _enhance_bigquery_lineage_with_sql_parsing(
+     operator_lineage: "OperatorLineage",
+     rendered_sql: str,
+     operator: Any,
+ ) -> None:
+     """
+     Enhance OperatorLineage with DataHub SQL parsing results.
+
+     Modifies operator_lineage in place by adding SQL parsing result to run_facets.
+     """
+     try:
+         from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
+         from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
+         from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener
+
+         platform = "bigquery"
+         default_database = (
+             operator.project_id if hasattr(operator, "project_id") else None
+         )
+
+         logger.debug(
+             f"Running DataHub SQL parser for BigQuery (platform={platform}, "
+             f"default_db={default_database}): {rendered_sql}"
+         )
+
+         listener = get_airflow_plugin_listener()
+         graph = listener.graph if listener else None
+
+         # Use DataHub's SQL parser with rendered SQL
+         sql_parsing_result = create_lineage_sql_parsed_result(
+             query=rendered_sql,
+             graph=graph,
+             platform=platform,
+             platform_instance=None,
+             env=builder.DEFAULT_ENV,
+             default_db=default_database,
+             default_schema=None,
+         )
+
+         logger.debug(
+             f"DataHub SQL parsing result: in_tables={len(sql_parsing_result.in_tables)}, "
+             f"out_tables={len(sql_parsing_result.out_tables)}, "
+             f"column_lineage={len(sql_parsing_result.column_lineage or [])}"
+         )
+
+         # Check if there's a destinationTable in configuration
+         destination_table = operator.configuration.get("query", {}).get(
+             "destinationTable"
+         )
+         if destination_table:
+             project_id = destination_table.get("projectId")
+             dataset_id = destination_table.get("datasetId")
+             table_id = destination_table.get("tableId")
+
+             if project_id and dataset_id and table_id:
+                 destination_table_urn = builder.make_dataset_urn(
+                     platform="bigquery",
+                     name=f"{project_id}.{dataset_id}.{table_id}",
+                     env=builder.DEFAULT_ENV,
+                 )
+                 # Add to output tables if not already present
+                 if destination_table_urn not in sql_parsing_result.out_tables:
+                     sql_parsing_result.out_tables.append(destination_table_urn)
+                     logger.debug(
+                         f"Added destination table to outputs: {destination_table_urn}"
+                     )
+
+         # Store the SQL parsing result in run_facets for DataHub listener
+         if sql_parsing_result:
+             operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = (
+                 sql_parsing_result
+             )
+             logger.debug(
+                 f"Added DataHub SQL parsing result with "
+                 f"{len(sql_parsing_result.column_lineage or [])} column lineages"
+             )
+
+     except Exception as e:
+         logger.warning(
+             f"Error running DataHub SQL parser for BigQuery: {e}",
+             exc_info=True,
+         )
+
+
+ def _create_bigquery_openlineage_wrapper(
+     original_method: Any,
+ ) -> Any:
+     """Create the wrapper function for BigQuery OpenLineage extraction."""
+
+     def get_openlineage_facets_on_complete(
+         self: Any, task_instance: "TaskInstance"
+     ) -> Optional["OperatorLineage"]:
+         """
+         Enhanced version that uses DataHub's SQL parser for better lineage.
+
+         This method:
+         1. Calls the original OpenLineage implementation
+         2. Enhances it with DataHub SQL parsing result for column lineage
+         """
+         try:
+             # Extract SQL from configuration
+             sql = self.configuration.get("query", {}).get("query")
+             if not sql:
+                 logger.debug(
+                     "No query found in BigQueryInsertJobOperator configuration"
+                 )
+                 return original_method(self, task_instance)
+
+             # Render Jinja templates in SQL if they exist
+             rendered_sql = _render_bigquery_sql_templates(sql, self, task_instance)
+
+             logger.debug(
+                 f"DataHub patched BigQuery get_openlineage_facets_on_complete called for query: {rendered_sql[:100]}..."
+             )
+
+             # Get the original OpenLineage result
+             operator_lineage = original_method(self, task_instance)
+
+             # If original returns None (no job_id found in test environment),
+             # create a new OperatorLineage so we can still add SQL parsing result
+             if not operator_lineage:
+                 logger.debug(
+                     "Original OpenLineage returned None for BigQueryInsertJobOperator, "
+                     "creating new OperatorLineage for SQL parsing"
+                 )
+                 from airflow.providers.openlineage.extractors import OperatorLineage
+
+                 operator_lineage = OperatorLineage(  # type: ignore[misc]
+                     inputs=[],
+                     outputs=[],
+                     job_facets={},
+                     run_facets={},
+                 )
+
+             logger.debug(
+                 f"Original BigQuery OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
+             )
+
+             # Enhance with DataHub SQL parsing
+             _enhance_bigquery_lineage_with_sql_parsing(
+                 operator_lineage, rendered_sql, self
+             )
+
+             return operator_lineage
+
+         except Exception as e:
+             logger.warning(
+                 f"Error in patched BigQueryInsertJobOperator.get_openlineage_facets_on_complete: {e}",
+                 exc_info=True,
+             )
+             # Fall back to original method
+             return original_method(self, task_instance)
+
+     return get_openlineage_facets_on_complete
+
+
+ def patch_bigquery_insert_job_operator() -> None:
+     """
+     Patch BigQueryInsertJobOperator to use DataHub's SQL parser for lineage extraction.
+
+     This enhances the existing OpenLineage support with DataHub's SQL parser,
+     which provides better column-level lineage.
+     """
+     try:
+         from airflow.providers.google.cloud.operators.bigquery import (
+             BigQueryInsertJobOperator,
+         )
+
+         # Check if operator should be patched
+         if not _should_patch_bigquery_operator(BigQueryInsertJobOperator):
+             return
+
+         # Store original method and create wrapper
+         original_method = BigQueryInsertJobOperator.get_openlineage_facets_on_complete
+         wrapper = _create_bigquery_openlineage_wrapper(original_method)
+
+         # Apply the patch
+         BigQueryInsertJobOperator.get_openlineage_facets_on_complete = (  # type: ignore[assignment,method-assign]
+             wrapper  # type: ignore[assignment]
+         )
+         BigQueryInsertJobOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]
+
+         logger.debug(
+             "Patched BigQueryInsertJobOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
+         )
+
+     except ImportError as e:
+         logger.debug(
+             f"Could not patch BigQueryInsertJobOperator for OpenLineage (provider not installed or Airflow < 3.0): {e}"
+         )
datahub_airflow_plugin/airflow3/_shims.py
@@ -0,0 +1,82 @@
+ """
+ Airflow 3.x specific shims and imports.
+ Clean, simple imports without cross-version compatibility complexity.
+ """
+
+ from typing import List, Union
+
+ from airflow.models.mappedoperator import MappedOperator
+
+ # Airflow 3.x SDK imports - these always exist in Airflow 3.x
+ from airflow.sdk.bases.operator import BaseOperator
+
+ # Operator type represents any operator (regular or mapped)
+ Operator = Union[BaseOperator, MappedOperator]
+
+ # ExternalTaskSensor import - uses standard provider in Airflow 3.x
+ try:
+     from airflow.providers.standard.sensors.external_task import ExternalTaskSensor
+ except ImportError:
+     # Fallback for earlier Airflow 3 versions
+     try:
+         from airflow.sensors.external_task import ExternalTaskSensor  # type: ignore[no-redef]  # noqa: I001
+     except ImportError:
+         from airflow.sensors.external_task_sensor import (  # type: ignore[no-redef]
+             ExternalTaskSensor,
+         )
+
+ # OpenLineage imports for Airflow 3.x (native provider)
+ try:
+     from airflow.providers.openlineage.plugins.openlineage import (
+         OpenLineageProviderPlugin as OpenLineagePlugin,
+     )
+     from airflow.providers.openlineage.utils.utils import (
+         get_operator_class,
+         try_import_from_string,
+     )
+
+     # Native provider doesn't need TaskHolder, use dict as placeholder
+     TaskHolder = dict  # type: ignore
+
+     def redact_with_exclusions(source: dict) -> dict:
+         """Compatibility shim - native provider doesn't expose this."""
+         return source
+
+ except ImportError:
+     # Native provider not installed
+     TaskHolder = dict  # type: ignore
+     OpenLineagePlugin = None  # type: ignore
+     get_operator_class = None  # type: ignore
+     try_import_from_string = None  # type: ignore
+
+     def redact_with_exclusions(source: dict) -> dict:
+         return source
+
+
+ def get_task_inlets(operator: "Operator") -> List:
+     """Get task inlets for Airflow 3.x."""
+     if hasattr(operator, "get_inlet_defs"):
+         return operator.get_inlet_defs()  # type: ignore[attr-defined]
+     return operator.inlets or []
+
+
+ def get_task_outlets(operator: "Operator") -> List:
+     """Get task outlets for Airflow 3.x."""
+     if hasattr(operator, "get_outlet_defs"):
+         return operator.get_outlet_defs()
+     return operator.outlets or []
+
+
+ __all__ = [
+     "BaseOperator",
+     "Operator",
+     "MappedOperator",
+     "ExternalTaskSensor",
+     "TaskHolder",
+     "OpenLineagePlugin",
+     "get_operator_class",
+     "try_import_from_string",
+     "redact_with_exclusions",
+     "get_task_inlets",
+     "get_task_outlets",
+ ]
datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py
@@ -0,0 +1,88 @@
+ """
+ Patch for SqliteHook to provide OpenLineage database info.
+
+ SqliteHook doesn't implement get_openlineage_database_info(), which causes
+ SQL lineage extraction to fail in Airflow 3.x. This patch adds the missing
+ implementation so that SQLite operators can properly extract lineage.
+ """
+
+ import logging
+ import os
+ from typing import TYPE_CHECKING, Any, Optional
+
+ if TYPE_CHECKING:
+     from airflow.models.connection import Connection
+     from airflow.providers.openlineage.sqlparser import DatabaseInfo
+
+ logger = logging.getLogger(__name__)
+
+
+ def patch_sqlite_hook() -> None:
+     """
+     Patch SqliteHook to provide OpenLineage database info.
+
+     This is necessary because SqliteHook doesn't override get_openlineage_database_info(),
+     causing it to return None and preventing SQL lineage extraction.
+     """
+     try:
+         from airflow.providers.openlineage.sqlparser import DatabaseInfo
+         from airflow.providers.sqlite.hooks.sqlite import SqliteHook
+
+         # Check if already patched
+         if hasattr(SqliteHook, "_datahub_openlineage_patched"):
+             logger.debug("SqliteHook already patched for OpenLineage")
+             return
+
+         def get_openlineage_database_info(
+             self: Any, connection: "Connection"
+         ) -> Optional["DatabaseInfo"]:
+             """
+             Return database info for SQLite connections.
+
+             For SQLite, the database name is derived from the connection's host field,
+             which contains the path to the SQLite database file.
+             """
+             # Get database path from connection
+             db_path = connection.host
+             if not db_path:
+                 # Try to get from connection extra or schema
+                 logger.debug("SQLite connection has no host (database path)")
+                 return None
+
+             # Extract database name from file path
+             # For SQLite, we use the filename without extension as the database name
+             db_name = os.path.splitext(os.path.basename(db_path))[0]
+
+             logger.debug(
+                 f"SQLite OpenLineage database info: path={db_path}, name={db_name}"
+             )
+
+             # Use connection type as scheme (e.g., "sqlite")
+             scheme = connection.conn_type or "sqlite"
+
+             # Create DatabaseInfo with SQLite-specific settings
+             return DatabaseInfo(
+                 scheme=scheme,
+                 authority=None,  # SQLite doesn't have authority (host:port)
+                 database=db_name,
+                 # SQLite doesn't have information_schema, so these won't be used
+                 information_schema_columns=[],
+                 information_schema_table_name="",
+                 use_flat_cross_db_query=False,
+                 is_information_schema_cross_db=False,
+                 is_uppercase_names=False,
+                 normalize_name_method=lambda x: x.lower(),  # SQLite is case-insensitive
+             )
+
+         # Apply the patch (mypy doesn't like dynamic method assignment, but it's necessary for patching)
+         SqliteHook.get_openlineage_database_info = get_openlineage_database_info  # type: ignore[method-assign,attr-defined]
+         SqliteHook._datahub_openlineage_patched = True  # type: ignore[attr-defined]
+
+         logger.debug(
+             "Patched SqliteHook.get_openlineage_database_info to provide database info for lineage extraction"
+         )
+
+     except ImportError as e:
+         logger.debug(
+             f"Could not patch SqliteHook for OpenLineage (likely Airflow < 3.0 or provider not installed): {e}"
+         )
+ )