acryl-datahub-airflow-plugin 1.3.1.3rc2__py3-none-any.whl → 1.3.1.5__py3-none-any.whl
This diff compares two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
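
The headline change in the manifest above is the split of version-specific code into `airflow2/` and `airflow3/` packages, with the shared `datahub_listener.py` shrinking to a thin dispatcher (+19/-790) and a new `_airflow_version_specific.py` (+184). The shipped selection logic is not shown in this diff; purely as a hypothetical sketch of the pattern, dispatch on the installed Airflow version might look like:

```python
import airflow
from packaging.version import Version  # packaging ships with Airflow

# Hypothetical dispatcher: route to the version-specific listener package.
# The real logic lives in _airflow_version_specific.py and may differ.
if Version(airflow.__version__) >= Version("3.0.0"):
    from datahub_airflow_plugin.airflow3.datahub_listener import (  # noqa: F401
        get_airflow_plugin_listener,
    )
else:
    from datahub_airflow_plugin.airflow2.datahub_listener import (  # noqa: F401
        get_airflow_plugin_listener,
    )
```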

datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py (new file, @@ -0,0 +1,273 @@)

```python
"""
Patch for BigQueryInsertJobOperator to use DataHub's SQL parser.

BigQueryInsertJobOperator in Airflow 3.x doesn't use the standard SQLParser approach
because it stores SQL in a configuration dictionary. This patch modifies
get_openlineage_facets_on_complete() to use DataHub's SQL parser, enabling
column-level lineage extraction.
"""

import logging
from typing import TYPE_CHECKING, Any, Optional

import datahub.emitter.mce_builder as builder

if TYPE_CHECKING:
    from airflow.models.taskinstance import TaskInstance
    from airflow.providers.openlineage.extractors import OperatorLineage

logger = logging.getLogger(__name__)


def _should_patch_bigquery_operator(operator_class: Any) -> bool:
    """Check if BigQuery operator should be patched."""
    if not hasattr(operator_class, "get_openlineage_facets_on_complete"):
        logger.debug(
            "BigQueryInsertJobOperator.get_openlineage_facets_on_complete not found - "
            "likely Airflow 2.x, skipping patch"
        )
        return False
    if hasattr(operator_class, "_datahub_openlineage_patched"):
        logger.debug("BigQueryInsertJobOperator already patched for OpenLineage")
        return False
    return True


def _render_bigquery_sql_templates(
    sql: str, operator: Any, task_instance: "TaskInstance"
) -> str:
    """
    Render Jinja templates in BigQuery SQL if they exist.

    Returns the rendered SQL, or original SQL if rendering fails.
    """
    if "{{" not in str(sql) and "{%" not in str(sql):
        return sql

    try:
        # Get template context from task_instance
        context: Any = {}
        if hasattr(task_instance, "get_template_context"):
            context = task_instance.get_template_context()
        elif (
            hasattr(task_instance, "task")
            and task_instance.task is not None
            and hasattr(task_instance.task, "get_template_context")
        ):
            context = task_instance.task.get_template_context()

        # Try to render using the operator's render_template method
        if hasattr(operator, "render_template") and context:
            rendered_sql = operator.render_template(sql, context)
        else:
            # Fallback: try to render using Jinja2 directly
            from airflow.templates import SandboxedEnvironment

            jinja_env = SandboxedEnvironment()
            template = jinja_env.from_string(str(sql))
            rendered_sql = template.render(**context)  # type: ignore[misc]

        logger.debug(
            "Rendered BigQuery SQL templates: %s -> %s",
            str(sql)[:100],
            str(rendered_sql)[:100],
        )
        return rendered_sql
    except Exception as e:
        logger.warning(
            "Failed to render BigQuery SQL templates, using original SQL: %s",
            e,
        )
        return sql


def _enhance_bigquery_lineage_with_sql_parsing(
    operator_lineage: "OperatorLineage",
    rendered_sql: str,
    operator: Any,
) -> None:
    """
    Enhance OperatorLineage with DataHub SQL parsing results.

    Modifies operator_lineage in place by adding SQL parsing result to run_facets.
    """
    try:
        from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

        from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
        from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener

        platform = "bigquery"
        default_database = (
            operator.project_id if hasattr(operator, "project_id") else None
        )

        logger.debug(
            f"Running DataHub SQL parser for BigQuery (platform={platform}, "
            f"default_db={default_database}): {rendered_sql}"
        )

        listener = get_airflow_plugin_listener()
        graph = listener.graph if listener else None

        # Use DataHub's SQL parser with rendered SQL
        sql_parsing_result = create_lineage_sql_parsed_result(
            query=rendered_sql,
            graph=graph,
            platform=platform,
            platform_instance=None,
            env=builder.DEFAULT_ENV,
            default_db=default_database,
            default_schema=None,
        )

        logger.debug(
            f"DataHub SQL parsing result: in_tables={len(sql_parsing_result.in_tables)}, "
            f"out_tables={len(sql_parsing_result.out_tables)}, "
            f"column_lineage={len(sql_parsing_result.column_lineage or [])}"
        )

        # Check if there's a destinationTable in configuration
        destination_table = operator.configuration.get("query", {}).get(
            "destinationTable"
        )
        if destination_table:
            project_id = destination_table.get("projectId")
            dataset_id = destination_table.get("datasetId")
            table_id = destination_table.get("tableId")

            if project_id and dataset_id and table_id:
                destination_table_urn = builder.make_dataset_urn(
                    platform="bigquery",
                    name=f"{project_id}.{dataset_id}.{table_id}",
                    env=builder.DEFAULT_ENV,
                )
                # Add to output tables if not already present
                if destination_table_urn not in sql_parsing_result.out_tables:
                    sql_parsing_result.out_tables.append(destination_table_urn)
                    logger.debug(
                        f"Added destination table to outputs: {destination_table_urn}"
                    )

        # Store the SQL parsing result in run_facets for DataHub listener
        if sql_parsing_result:
            operator_lineage.run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] = (
                sql_parsing_result
            )
            logger.debug(
                f"Added DataHub SQL parsing result with "
                f"{len(sql_parsing_result.column_lineage or [])} column lineages"
            )

    except Exception as e:
        logger.warning(
            f"Error running DataHub SQL parser for BigQuery: {e}",
            exc_info=True,
        )


def _create_bigquery_openlineage_wrapper(
    original_method: Any,
) -> Any:
    """Create the wrapper function for BigQuery OpenLineage extraction."""

    def get_openlineage_facets_on_complete(
        self: Any, task_instance: "TaskInstance"
    ) -> Optional["OperatorLineage"]:
        """
        Enhanced version that uses DataHub's SQL parser for better lineage.

        This method:
        1. Calls the original OpenLineage implementation
        2. Enhances it with DataHub SQL parsing result for column lineage
        """
        try:
            # Extract SQL from configuration
            sql = self.configuration.get("query", {}).get("query")
            if not sql:
                logger.debug(
                    "No query found in BigQueryInsertJobOperator configuration"
                )
                return original_method(self, task_instance)

            # Render Jinja templates in SQL if they exist
            rendered_sql = _render_bigquery_sql_templates(sql, self, task_instance)

            logger.debug(
                f"DataHub patched BigQuery get_openlineage_facets_on_complete called for query: {rendered_sql[:100]}..."
            )

            # Get the original OpenLineage result
            operator_lineage = original_method(self, task_instance)

            # If original returns None (no job_id found in test environment),
            # create a new OperatorLineage so we can still add SQL parsing result
            if not operator_lineage:
                logger.debug(
                    "Original OpenLineage returned None for BigQueryInsertJobOperator, "
                    "creating new OperatorLineage for SQL parsing"
                )
                from airflow.providers.openlineage.extractors import OperatorLineage

                operator_lineage = OperatorLineage(  # type: ignore[misc]
                    inputs=[],
                    outputs=[],
                    job_facets={},
                    run_facets={},
                )

            logger.debug(
                f"Original BigQuery OpenLineage result: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}"
            )

            # Enhance with DataHub SQL parsing
            _enhance_bigquery_lineage_with_sql_parsing(
                operator_lineage, rendered_sql, self
            )

            return operator_lineage

        except Exception as e:
            logger.warning(
                f"Error in patched BigQueryInsertJobOperator.get_openlineage_facets_on_complete: {e}",
                exc_info=True,
            )
            # Fall back to original method
            return original_method(self, task_instance)

    return get_openlineage_facets_on_complete


def patch_bigquery_insert_job_operator() -> None:
    """
    Patch BigQueryInsertJobOperator to use DataHub's SQL parser for lineage extraction.

    This enhances the existing OpenLineage support with DataHub's SQL parser,
    which provides better column-level lineage.
    """
    try:
        from airflow.providers.google.cloud.operators.bigquery import (
            BigQueryInsertJobOperator,
        )

        # Check if operator should be patched
        if not _should_patch_bigquery_operator(BigQueryInsertJobOperator):
            return

        # Store original method and create wrapper
        original_method = BigQueryInsertJobOperator.get_openlineage_facets_on_complete
        wrapper = _create_bigquery_openlineage_wrapper(original_method)

        # Apply the patch
        BigQueryInsertJobOperator.get_openlineage_facets_on_complete = (  # type: ignore[assignment,method-assign]
            wrapper  # type: ignore[assignment]
        )
        BigQueryInsertJobOperator._datahub_openlineage_patched = True  # type: ignore[attr-defined]

        logger.debug(
            "Patched BigQueryInsertJobOperator.get_openlineage_facets_on_complete to use DataHub SQL parser"
        )

    except ImportError as e:
        logger.debug(
            f"Could not patch BigQueryInsertJobOperator for OpenLineage (provider not installed or Airflow < 3.0): {e}"
        )
```
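
The patch installs itself via `patch_bigquery_insert_job_operator()`, presumably called once during listener startup. As a minimal sketch of what the patched path sees (the task id, project, and SQL below are hypothetical; the operator API is the standard Google provider one):

```python
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

from datahub_airflow_plugin.airflow3._bigquery_openlineage_patch import (
    patch_bigquery_insert_job_operator,
)

# Install the wrapper; idempotent thanks to the _datahub_openlineage_patched marker.
patch_bigquery_insert_job_operator()

# A typical operator whose SQL lives in configuration["query"]["query"],
# which is exactly where the wrapper extracts it from.
insert_job = BigQueryInsertJobOperator(
    task_id="load_daily_summary",  # hypothetical task
    project_id="my-gcp-project",   # hypothetical project; used as default_db
    configuration={
        "query": {
            "query": "INSERT INTO reporting.daily_summary "
                     "SELECT dt, SUM(amount) FROM raw.events GROUP BY dt",
            "useLegacySql": False,
        }
    },
)
# When the task completes, get_openlineage_facets_on_complete() renders any
# Jinja in the SQL, runs DataHub's sqlglot-based parser, and stashes the result
# in run_facets[DATAHUB_SQL_PARSING_RESULT_KEY] for the DataHub listener.
```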
datahub_airflow_plugin/airflow3/_shims.py (new file, @@ -0,0 +1,82 @@)

```python
"""
Airflow 3.x specific shims and imports.
Clean, simple imports without cross-version compatibility complexity.
"""

from typing import List, Union

from airflow.models.mappedoperator import MappedOperator

# Airflow 3.x SDK imports - these always exist in Airflow 3.x
from airflow.sdk.bases.operator import BaseOperator

# Operator type represents any operator (regular or mapped)
Operator = Union[BaseOperator, MappedOperator]

# ExternalTaskSensor import - uses standard provider in Airflow 3.x
try:
    from airflow.providers.standard.sensors.external_task import ExternalTaskSensor
except ImportError:
    # Fallback for earlier Airflow 3 versions
    try:
        from airflow.sensors.external_task import ExternalTaskSensor  # type: ignore[no-redef] # noqa: I001
    except ImportError:
        from airflow.sensors.external_task_sensor import (  # type: ignore[no-redef]
            ExternalTaskSensor,
        )

# OpenLineage imports for Airflow 3.x (native provider)
try:
    from airflow.providers.openlineage.plugins.openlineage import (
        OpenLineageProviderPlugin as OpenLineagePlugin,
    )
    from airflow.providers.openlineage.utils.utils import (
        get_operator_class,
        try_import_from_string,
    )

    # Native provider doesn't need TaskHolder, use dict as placeholder
    TaskHolder = dict  # type: ignore

    def redact_with_exclusions(source: dict) -> dict:
        """Compatibility shim - native provider doesn't expose this."""
        return source

except ImportError:
    # Native provider not installed
    TaskHolder = dict  # type: ignore
    OpenLineagePlugin = None  # type: ignore
    get_operator_class = None  # type: ignore
    try_import_from_string = None  # type: ignore

    def redact_with_exclusions(source: dict) -> dict:
        return source


def get_task_inlets(operator: "Operator") -> List:
    """Get task inlets for Airflow 3.x."""
    if hasattr(operator, "get_inlet_defs"):
        return operator.get_inlet_defs()  # type: ignore[attr-defined]
    return operator.inlets or []


def get_task_outlets(operator: "Operator") -> List:
    """Get task outlets for Airflow 3.x."""
    if hasattr(operator, "get_outlet_defs"):
        return operator.get_outlet_defs()
    return operator.outlets or []


__all__ = [
    "BaseOperator",
    "Operator",
    "MappedOperator",
    "ExternalTaskSensor",
    "TaskHolder",
    "OpenLineagePlugin",
    "get_operator_class",
    "try_import_from_string",
    "redact_with_exclusions",
    "get_task_inlets",
    "get_task_outlets",
]
```
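
A short sketch of how listener code might consume these shims (the helper function below is hypothetical; in practice the task object would come from an Airflow TaskInstance):

```python
from typing import List, Tuple

from datahub_airflow_plugin.airflow3._shims import (
    Operator,
    get_task_inlets,
    get_task_outlets,
)


def collect_declared_lineage(task: Operator) -> Tuple[List, List]:
    # Works whether the operator exposes get_inlet_defs()/get_outlet_defs()
    # (newer SDK surface) or only the plain .inlets/.outlets attributes.
    return get_task_inlets(task), get_task_outlets(task)
```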
datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py (new file, @@ -0,0 +1,88 @@)

```python
"""
Patch for SqliteHook to provide OpenLineage database info.

SqliteHook doesn't implement get_openlineage_database_info(), which causes
SQL lineage extraction to fail in Airflow 3.x. This patch adds the missing
implementation so that SQLite operators can properly extract lineage.
"""

import logging
import os
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    from airflow.models.connection import Connection
    from airflow.providers.openlineage.sqlparser import DatabaseInfo

logger = logging.getLogger(__name__)


def patch_sqlite_hook() -> None:
    """
    Patch SqliteHook to provide OpenLineage database info.

    This is necessary because SqliteHook doesn't override get_openlineage_database_info(),
    causing it to return None and preventing SQL lineage extraction.
    """
    try:
        from airflow.providers.openlineage.sqlparser import DatabaseInfo
        from airflow.providers.sqlite.hooks.sqlite import SqliteHook

        # Check if already patched
        if hasattr(SqliteHook, "_datahub_openlineage_patched"):
            logger.debug("SqliteHook already patched for OpenLineage")
            return

        def get_openlineage_database_info(
            self: Any, connection: "Connection"
        ) -> Optional["DatabaseInfo"]:
            """
            Return database info for SQLite connections.

            For SQLite, the database name is derived from the connection's host field,
            which contains the path to the SQLite database file.
            """
            # Get database path from connection
            db_path = connection.host
            if not db_path:
                # Try to get from connection extra or schema
                logger.debug("SQLite connection has no host (database path)")
                return None

            # Extract database name from file path
            # For SQLite, we use the filename without extension as the database name
            db_name = os.path.splitext(os.path.basename(db_path))[0]

            logger.debug(
                f"SQLite OpenLineage database info: path={db_path}, name={db_name}"
            )

            # Use connection type as scheme (e.g., "sqlite")
            scheme = connection.conn_type or "sqlite"

            # Create DatabaseInfo with SQLite-specific settings
            return DatabaseInfo(
                scheme=scheme,
                authority=None,  # SQLite doesn't have authority (host:port)
                database=db_name,
                # SQLite doesn't have information_schema, so these won't be used
                information_schema_columns=[],
                information_schema_table_name="",
                use_flat_cross_db_query=False,
                is_information_schema_cross_db=False,
                is_uppercase_names=False,
                normalize_name_method=lambda x: x.lower(),  # SQLite is case-insensitive
            )

        # Apply the patch (mypy doesn't like dynamic method assignment, but it's necessary for patching)
        SqliteHook.get_openlineage_database_info = get_openlineage_database_info  # type: ignore[method-assign,attr-defined]
        SqliteHook._datahub_openlineage_patched = True  # type: ignore[attr-defined]

        logger.debug(
            "Patched SqliteHook.get_openlineage_database_info to provide database info for lineage extraction"
        )

    except ImportError as e:
        logger.debug(
            f"Could not patch SqliteHook for OpenLineage (likely Airflow < 3.0 or provider not installed): {e}"
        )
```
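
Assuming a SQLite connection whose host field holds the database file path (the path below is made up), the patched hook would report database info like this:

```python
from airflow.models.connection import Connection
from airflow.providers.sqlite.hooks.sqlite import SqliteHook

from datahub_airflow_plugin.airflow3._sqlite_openlineage_patch import patch_sqlite_hook

patch_sqlite_hook()

conn = Connection(
    conn_id="sqlite_default",
    conn_type="sqlite",
    host="/tmp/analytics.db",  # hypothetical database file path
)
hook = SqliteHook()
info = hook.get_openlineage_database_info(conn)
# info.scheme == "sqlite", info.authority is None, info.database == "analytics"
```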