acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
- datahub_airflow_plugin/_airflow_shims.py +31 -64
- datahub_airflow_plugin/_config.py +19 -97
- datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
- datahub_airflow_plugin/_extractors.py +365 -0
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/client/airflow_generator.py +43 -147
- datahub_airflow_plugin/datahub_listener.py +790 -19
- datahub_airflow_plugin/example_dags/__init__.py +0 -32
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
- datahub_airflow_plugin/hooks/datahub.py +2 -11
- datahub_airflow_plugin/operators/datahub.py +3 -20
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
- datahub_airflow_plugin/_airflow_compat.py +0 -32
- datahub_airflow_plugin/_airflow_version_specific.py +0 -184
- datahub_airflow_plugin/_constants.py +0 -16
- datahub_airflow_plugin/airflow2/__init__.py +0 -6
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
- datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
- datahub_airflow_plugin/airflow2/_extractors.py +0 -477
- datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
- datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
- datahub_airflow_plugin/airflow2/_shims.py +0 -88
- datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
- datahub_airflow_plugin/airflow3/__init__.py +0 -6
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
- datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
- datahub_airflow_plugin/airflow3/_shims.py +0 -82
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
- datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,1452 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import functools
|
|
3
|
-
import logging
|
|
4
|
-
import os
|
|
5
|
-
import threading
|
|
6
|
-
import time
|
|
7
|
-
from typing import (
|
|
8
|
-
TYPE_CHECKING,
|
|
9
|
-
Any,
|
|
10
|
-
Callable,
|
|
11
|
-
Dict,
|
|
12
|
-
List,
|
|
13
|
-
Optional,
|
|
14
|
-
Tuple,
|
|
15
|
-
TypeVar,
|
|
16
|
-
cast,
|
|
17
|
-
)
|
|
18
|
-
from urllib.parse import urlparse, urlunparse
|
|
19
|
-
|
|
20
|
-
import airflow
|
|
21
|
-
from airflow.configuration import conf
|
|
22
|
-
from airflow.models.serialized_dag import SerializedDagModel
|
|
23
|
-
from airflow.sdk import Connection
|
|
24
|
-
from openlineage.client.serde import Serde
|
|
25
|
-
|
|
26
|
-
import datahub.emitter.mce_builder as builder
|
|
27
|
-
from datahub.api.entities.datajob import DataJob
|
|
28
|
-
from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult
|
|
29
|
-
from datahub.emitter.composite_emitter import CompositeEmitter
|
|
30
|
-
from datahub.emitter.generic_emitter import Emitter
|
|
31
|
-
from datahub.emitter.mce_builder import (
|
|
32
|
-
make_data_platform_urn,
|
|
33
|
-
make_dataplatform_instance_urn,
|
|
34
|
-
)
|
|
35
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
36
|
-
from datahub.ingestion.graph.client import DataHubGraph
|
|
37
|
-
from datahub.metadata.schema_classes import (
|
|
38
|
-
BrowsePathEntryClass,
|
|
39
|
-
BrowsePathsV2Class,
|
|
40
|
-
DataFlowKeyClass,
|
|
41
|
-
DataJobInputOutputClass,
|
|
42
|
-
DataJobKeyClass,
|
|
43
|
-
DataPlatformInstanceClass,
|
|
44
|
-
FineGrainedLineageClass,
|
|
45
|
-
FineGrainedLineageDownstreamTypeClass,
|
|
46
|
-
FineGrainedLineageUpstreamTypeClass,
|
|
47
|
-
OperationClass,
|
|
48
|
-
OperationTypeClass,
|
|
49
|
-
StatusClass,
|
|
50
|
-
)
|
|
51
|
-
from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
|
|
52
|
-
from datahub.telemetry import telemetry
|
|
53
|
-
|
|
54
|
-
# Import Airflow 3.x specific shims (clean, no cross-version complexity)
|
|
55
|
-
from datahub_airflow_plugin._config import DatahubLineageConfig, get_lineage_config
|
|
56
|
-
from datahub_airflow_plugin._constants import DATAHUB_SQL_PARSING_RESULT_KEY
|
|
57
|
-
from datahub_airflow_plugin._version import __package_name__, __version__
|
|
58
|
-
|
|
59
|
-
# Import Airflow 3.x compatibility and patches before any Airflow imports
|
|
60
|
-
from datahub_airflow_plugin.airflow3 import _airflow_compat # noqa: F401
|
|
61
|
-
from datahub_airflow_plugin.airflow3._shims import (
|
|
62
|
-
OpenLineagePlugin,
|
|
63
|
-
Operator,
|
|
64
|
-
get_task_inlets,
|
|
65
|
-
get_task_outlets,
|
|
66
|
-
redact_with_exclusions,
|
|
67
|
-
)
|
|
68
|
-
from datahub_airflow_plugin.client.airflow_generator import ( # type: ignore[attr-defined]
|
|
69
|
-
AirflowGenerator,
|
|
70
|
-
)
|
|
71
|
-
from datahub_airflow_plugin.entities import (
|
|
72
|
-
_Entity,
|
|
73
|
-
entities_to_datajob_urn_list,
|
|
74
|
-
entities_to_dataset_urn_list,
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
# Airflow 3.x always has these APIs
|
|
78
|
-
HAS_AIRFLOW_DAG_LISTENER_API: bool = True
|
|
79
|
-
HAS_AIRFLOW_DATASET_LISTENER_API: bool = True
|
|
80
|
-
|
|
81
|
-
# Airflow 3.0+: No extractors, use OpenLineage native integration
|
|
82
|
-
ExtractorManager = None # type: ignore
|
|
83
|
-
|
|
84
|
-
_F = TypeVar("_F", bound=Callable[..., None])
|
|
85
|
-
if TYPE_CHECKING:
|
|
86
|
-
from airflow.datasets import Dataset
|
|
87
|
-
from airflow.models import DagRun, TaskInstance
|
|
88
|
-
from airflow.sdk.definitions.dag import DAG
|
|
89
|
-
|
|
90
|
-
# To placate mypy on Airflow versions that don't have the listener API,
|
|
91
|
-
# we define a dummy hookimpl that's an identity function.
|
|
92
|
-
|
|
93
|
-
def hookimpl(f: _F) -> _F: # type: ignore[misc]
|
|
94
|
-
return f
|
|
95
|
-
|
|
96
|
-
else:
|
|
97
|
-
from airflow.listeners import hookimpl
|
|
98
|
-
|
|
99
|
-
logger = logging.getLogger(__name__)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def _get_dagrun_from_task_instance(task_instance: "TaskInstance") -> "DagRun":
|
|
103
|
-
"""
|
|
104
|
-
Get a DagRun from a TaskInstance (Airflow 3.x).
|
|
105
|
-
|
|
106
|
-
In Airflow 3.x, RuntimeTaskInstance doesn't have a dag_run attribute, so we create a
|
|
107
|
-
proxy object with the attributes we need.
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
class DagRunProxy:
|
|
111
|
-
"""
|
|
112
|
-
DagRun proxy for Airflow 3.x RuntimeTaskInstance.
|
|
113
|
-
|
|
114
|
-
Provides minimal DagRun interface needed by the listener.
|
|
115
|
-
"""
|
|
116
|
-
|
|
117
|
-
def __init__(self, ti: "TaskInstance"):
|
|
118
|
-
self.ti = ti
|
|
119
|
-
|
|
120
|
-
@property
|
|
121
|
-
def dag(self) -> Any:
|
|
122
|
-
"""Get DAG from task.dag"""
|
|
123
|
-
task = getattr(self.ti, "task", None)
|
|
124
|
-
if task:
|
|
125
|
-
return task.dag
|
|
126
|
-
return None
|
|
127
|
-
|
|
128
|
-
@property
|
|
129
|
-
def dag_id(self) -> Any:
|
|
130
|
-
"""Get dag_id from task instance"""
|
|
131
|
-
return getattr(self.ti, "dag_id", None)
|
|
132
|
-
|
|
133
|
-
@property
|
|
134
|
-
def run_id(self) -> Any:
|
|
135
|
-
"""Get run_id from task instance"""
|
|
136
|
-
return getattr(self.ti, "run_id", None)
|
|
137
|
-
|
|
138
|
-
def __repr__(self) -> str:
|
|
139
|
-
return f"DagRunProxy(dag_id={self.dag_id!r}, run_id={self.run_id!r})"
|
|
140
|
-
|
|
141
|
-
return DagRunProxy(task_instance) # type: ignore[return-value]
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
_airflow_listener_initialized = False
|
|
145
|
-
_airflow_listener: Optional["DataHubListener"] = None
|
|
146
|
-
_airflow_listener_lock = threading.Lock()
|
|
147
|
-
|
|
148
|
-
# Threading is enabled by default for better performance
|
|
149
|
-
# It prevents slow lineage extraction from blocking task completion
|
|
150
|
-
# Can be disabled by setting DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD=false
|
|
151
|
-
_RUN_IN_THREAD = os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD", "true").lower() in (
|
|
152
|
-
"true",
|
|
153
|
-
"1",
|
|
154
|
-
)
|
|
155
|
-
_RUN_IN_THREAD_TIMEOUT = float(
|
|
156
|
-
os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD_TIMEOUT", 10)
|
|
157
|
-
)
|
|
158
|
-
_DATAHUB_CLEANUP_DAG = "Datahub_Cleanup"
|
|
159
|
-
|
|
160
|
-
KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def get_airflow_plugin_listener() -> Optional["DataHubListener"]:
|
|
164
|
-
"""
|
|
165
|
-
Get or initialize the DataHub listener singleton.
|
|
166
|
-
|
|
167
|
-
Uses double-checked locking pattern for thread-safe lazy initialization.
|
|
168
|
-
This prevents race conditions when multiple worker threads try to initialize
|
|
169
|
-
the listener simultaneously.
|
|
170
|
-
"""
|
|
171
|
-
global _airflow_listener_initialized
|
|
172
|
-
global _airflow_listener
|
|
173
|
-
|
|
174
|
-
# Fast path: if already initialized, return immediately without acquiring lock
|
|
175
|
-
if _airflow_listener_initialized:
|
|
176
|
-
return _airflow_listener
|
|
177
|
-
|
|
178
|
-
# Slow path: acquire lock for initialization
|
|
179
|
-
with _airflow_listener_lock:
|
|
180
|
-
# Double-check: another thread might have initialized while we waited for lock
|
|
181
|
-
if _airflow_listener_initialized:
|
|
182
|
-
return _airflow_listener
|
|
183
|
-
|
|
184
|
-
# Now safe to initialize - we hold the lock and confirmed not initialized
|
|
185
|
-
_airflow_listener_initialized = True
|
|
186
|
-
|
|
187
|
-
plugin_config = get_lineage_config()
|
|
188
|
-
|
|
189
|
-
if plugin_config.enabled:
|
|
190
|
-
_airflow_listener = DataHubListener(config=plugin_config)
|
|
191
|
-
logger.info(
|
|
192
|
-
f"DataHub plugin v2 (package: {__package_name__} and version: {__version__}) listener initialized with config: {plugin_config}"
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
telemetry.telemetry_instance.ping(
|
|
196
|
-
"airflow-plugin-init",
|
|
197
|
-
{
|
|
198
|
-
"airflow-version": airflow.__version__,
|
|
199
|
-
"datahub-airflow-plugin": "v2",
|
|
200
|
-
"datahub-airflow-plugin-dag-events": HAS_AIRFLOW_DAG_LISTENER_API,
|
|
201
|
-
"capture_executions": plugin_config.capture_executions,
|
|
202
|
-
"capture_tags": plugin_config.capture_tags_info,
|
|
203
|
-
"capture_ownership": plugin_config.capture_ownership_info,
|
|
204
|
-
"enable_extractors": plugin_config.enable_extractors,
|
|
205
|
-
"render_templates": plugin_config.render_templates,
|
|
206
|
-
"disable_openlineage_plugin": plugin_config.disable_openlineage_plugin,
|
|
207
|
-
},
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
# Debug: Log OpenLineage plugin state
|
|
211
|
-
if OpenLineagePlugin is not None:
|
|
212
|
-
logger.debug(
|
|
213
|
-
f"OpenLineage plugin state: listeners={len(getattr(OpenLineagePlugin, 'listeners', []))} items, "
|
|
214
|
-
f"disable_openlineage_plugin={plugin_config.disable_openlineage_plugin}"
|
|
215
|
-
)
|
|
216
|
-
|
|
217
|
-
if plugin_config.disable_openlineage_plugin and OpenLineagePlugin is not None:
|
|
218
|
-
# Deactivate the OpenLineagePlugin listener to avoid conflicts/errors.
|
|
219
|
-
OpenLineagePlugin.listeners = []
|
|
220
|
-
logger.debug("Cleared OpenLineage plugin listeners")
|
|
221
|
-
|
|
222
|
-
return _airflow_listener
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def run_in_thread(f: _F) -> _F:
|
|
226
|
-
# This is also responsible for catching exceptions and logging them.
|
|
227
|
-
|
|
228
|
-
@functools.wraps(f)
|
|
229
|
-
def wrapper(*args, **kwargs):
|
|
230
|
-
def safe_target():
|
|
231
|
-
"""
|
|
232
|
-
Wrapper for the thread target that catches and logs exceptions.
|
|
233
|
-
|
|
234
|
-
Without this, exceptions raised inside the thread would be silently
|
|
235
|
-
lost, making debugging production issues nearly impossible.
|
|
236
|
-
"""
|
|
237
|
-
try:
|
|
238
|
-
f(*args, **kwargs)
|
|
239
|
-
except Exception as e:
|
|
240
|
-
logger.error(
|
|
241
|
-
f"Error in thread executing {f.__name__}: {e}",
|
|
242
|
-
exc_info=True,
|
|
243
|
-
)
|
|
244
|
-
|
|
245
|
-
try:
|
|
246
|
-
if _RUN_IN_THREAD:
|
|
247
|
-
# A poor-man's timeout mechanism.
|
|
248
|
-
# This ensures that we don't hang the task if the extractors
|
|
249
|
-
# are slow or the DataHub API is slow to respond.
|
|
250
|
-
|
|
251
|
-
thread = threading.Thread(target=safe_target, daemon=True)
|
|
252
|
-
thread.start()
|
|
253
|
-
|
|
254
|
-
if _RUN_IN_THREAD_TIMEOUT > 0:
|
|
255
|
-
# If _RUN_IN_THREAD_TIMEOUT is 0, we just kick off the thread and move on.
|
|
256
|
-
# Because it's a daemon thread, it'll be automatically killed when the main
|
|
257
|
-
# thread exists.
|
|
258
|
-
|
|
259
|
-
start_time = time.time()
|
|
260
|
-
thread.join(timeout=_RUN_IN_THREAD_TIMEOUT)
|
|
261
|
-
if thread.is_alive():
|
|
262
|
-
logger.warning(
|
|
263
|
-
f"Thread for {f.__name__} is still running after {_RUN_IN_THREAD_TIMEOUT} seconds. "
|
|
264
|
-
"Continuing without waiting for it to finish."
|
|
265
|
-
)
|
|
266
|
-
else:
|
|
267
|
-
logger.debug(
|
|
268
|
-
f"Thread for {f.__name__} finished after {time.time() - start_time} seconds"
|
|
269
|
-
)
|
|
270
|
-
else:
|
|
271
|
-
f(*args, **kwargs)
|
|
272
|
-
except Exception as e:
|
|
273
|
-
logger.warning(
|
|
274
|
-
f"Error setting up thread for {f.__name__}: {e}",
|
|
275
|
-
exc_info=True,
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
return cast(_F, wrapper)
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
def _render_templates(task_instance: "TaskInstance") -> "TaskInstance":
|
|
282
|
-
"""
|
|
283
|
-
Templates are already rendered in Airflow 3.x by the task execution system.
|
|
284
|
-
|
|
285
|
-
RuntimeTaskInstance contains unpickleable thread locks, so we cannot use deepcopy.
|
|
286
|
-
RuntimeTaskInstance.task contains the operator with rendered templates.
|
|
287
|
-
"""
|
|
288
|
-
logger.debug(
|
|
289
|
-
"Skipping template rendering for Airflow 3.0+ (already rendered by task worker)"
|
|
290
|
-
)
|
|
291
|
-
return task_instance
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
class DataHubListener:
|
|
295
|
-
__name__ = "DataHubListener"
|
|
296
|
-
|
|
297
|
-
def __init__(self, config: DatahubLineageConfig):
|
|
298
|
-
self.config = config
|
|
299
|
-
self._set_log_level()
|
|
300
|
-
|
|
301
|
-
# Lazy-load emitter to avoid connection retrieval during plugin initialization
|
|
302
|
-
# Connection retrieval via BaseHook.get_connection() only works during task execution
|
|
303
|
-
# where SUPERVISOR_COMMS is available
|
|
304
|
-
self._emitter: Optional[Emitter] = None
|
|
305
|
-
self._graph: Optional[DataHubGraph] = None
|
|
306
|
-
|
|
307
|
-
# For Airflow 3.0+, we don't need TaskHolder (dict is used as placeholder)
|
|
308
|
-
self._task_holder: Dict[str, Any] = {}
|
|
309
|
-
|
|
310
|
-
# Cache initial datajob objects to merge with completion events
|
|
311
|
-
self._datajob_holder: Dict[str, DataJob] = {}
|
|
312
|
-
|
|
313
|
-
# Airflow 3.0+ doesn't use extractors
|
|
314
|
-
self.extractor_manager = None
|
|
315
|
-
|
|
316
|
-
# This "inherits" from types.ModuleType to avoid issues with Airflow's listener plugin loader.
|
|
317
|
-
# It previously (v2.4.x and likely other versions too) would throw errors if it was not a module.
|
|
318
|
-
# https://github.com/apache/airflow/blob/e99a518970b2d349a75b1647f6b738c8510fa40e/airflow/listeners/listener.py#L56
|
|
319
|
-
# self.__class__ = types.ModuleType
|
|
320
|
-
|
|
321
|
-
def _get_emitter(self):
|
|
322
|
-
"""
|
|
323
|
-
Lazy-load emitter on first use during task execution.
|
|
324
|
-
|
|
325
|
-
This is a method (not a property) to avoid triggering during pluggy's
|
|
326
|
-
attribute introspection when registering the listener.
|
|
327
|
-
|
|
328
|
-
Uses database access to retrieve connection details, avoiding SUPERVISOR_COMMS
|
|
329
|
-
limitations that prevent hook-based methods from working in listener context.
|
|
330
|
-
"""
|
|
331
|
-
if self._emitter is None:
|
|
332
|
-
try:
|
|
333
|
-
self._emitter = self._create_emitter_from_connection()
|
|
334
|
-
if self._emitter:
|
|
335
|
-
logger.debug(
|
|
336
|
-
f"DataHub plugin v2 using {repr(self._emitter)} (created via connection API)"
|
|
337
|
-
)
|
|
338
|
-
else:
|
|
339
|
-
logger.debug(
|
|
340
|
-
"Could not create emitter via DB access - will retry during task execution"
|
|
341
|
-
)
|
|
342
|
-
return None
|
|
343
|
-
except Exception as db_error:
|
|
344
|
-
logger.debug(
|
|
345
|
-
f"Failed to create emitter via DB access: {db_error}. "
|
|
346
|
-
"Will retry during task execution.",
|
|
347
|
-
exc_info=True,
|
|
348
|
-
)
|
|
349
|
-
return None
|
|
350
|
-
return self._emitter
|
|
351
|
-
|
|
352
|
-
def _create_emitter_from_connection(self):
|
|
353
|
-
"""
|
|
354
|
-
Create emitter by retrieving connection details using Airflow's connection API.
|
|
355
|
-
|
|
356
|
-
Uses Connection.get() from SDK which works in all contexts:
|
|
357
|
-
- Task execution (on_task_instance_running, on_task_instance_success, etc.)
|
|
358
|
-
- DAG lifecycle hooks (on_dag_start, on_dag_run_running, etc.)
|
|
359
|
-
- Listener hooks (where SUPERVISOR_COMMS is not available)
|
|
360
|
-
|
|
361
|
-
This method works around the SUPERVISOR_COMMS limitation and Airflow 3.0's
|
|
362
|
-
ORM restriction by using the proper Airflow APIs instead of direct database access.
|
|
363
|
-
Supports datahub-rest, datahub-file, and datahub-kafka connection types.
|
|
364
|
-
Handles multiple comma-separated connection IDs via CompositeEmitter.
|
|
365
|
-
"""
|
|
366
|
-
try:
|
|
367
|
-
# Parse comma-separated connection IDs
|
|
368
|
-
connection_ids = self.config._datahub_connection_ids
|
|
369
|
-
|
|
370
|
-
if len(connection_ids) > 1:
|
|
371
|
-
# Multiple connections - use CompositeEmitter
|
|
372
|
-
emitters = []
|
|
373
|
-
for conn_id in connection_ids:
|
|
374
|
-
emitter = self._create_single_emitter_from_connection(conn_id)
|
|
375
|
-
if emitter:
|
|
376
|
-
emitters.append(emitter)
|
|
377
|
-
|
|
378
|
-
if not emitters:
|
|
379
|
-
logger.warning(
|
|
380
|
-
f"Could not create any emitters from connection IDs: {connection_ids}"
|
|
381
|
-
)
|
|
382
|
-
return None
|
|
383
|
-
|
|
384
|
-
logger.debug(
|
|
385
|
-
f"Created CompositeEmitter with {len(emitters)} emitters from connection IDs: {connection_ids}"
|
|
386
|
-
)
|
|
387
|
-
return CompositeEmitter(emitters)
|
|
388
|
-
else:
|
|
389
|
-
# Single connection
|
|
390
|
-
return self._create_single_emitter_from_connection(connection_ids[0])
|
|
391
|
-
|
|
392
|
-
except Exception as e:
|
|
393
|
-
logger.debug(
|
|
394
|
-
f"Failed to create emitter from connection: {e}", exc_info=True
|
|
395
|
-
)
|
|
396
|
-
return None
|
|
397
|
-
|
|
398
|
-
def _create_single_emitter_from_connection(self, conn_id: str) -> Optional[Emitter]:
|
|
399
|
-
"""
|
|
400
|
-
Create a single emitter from a connection ID.
|
|
401
|
-
|
|
402
|
-
Uses Connection.get() from SDK which works in all contexts:
|
|
403
|
-
- Task execution (on_task_instance_running, on_task_instance_success, etc.)
|
|
404
|
-
- DAG lifecycle hooks (on_dag_start, on_dag_run_running, etc.)
|
|
405
|
-
- Listener hooks (where SUPERVISOR_COMMS is not available)
|
|
406
|
-
|
|
407
|
-
This method checks environment variables, secrets backends, and the database
|
|
408
|
-
through the proper Airflow APIs, avoiding the ORM restriction in Airflow 3.0.
|
|
409
|
-
"""
|
|
410
|
-
try:
|
|
411
|
-
# In Airflow 3.0, direct ORM database access is not allowed during task execution.
|
|
412
|
-
# Use Connection.get() from SDK which works in all contexts.
|
|
413
|
-
# This method checks environment variables, secrets backends, and the database
|
|
414
|
-
# through the proper Airflow APIs.
|
|
415
|
-
conn = Connection.get(conn_id)
|
|
416
|
-
if not conn:
|
|
417
|
-
logger.warning(
|
|
418
|
-
f"Connection '{conn_id}' not found in secrets backend or environment variables"
|
|
419
|
-
)
|
|
420
|
-
return None
|
|
421
|
-
|
|
422
|
-
# Normalize conn_type (handle both dashes and underscores)
|
|
423
|
-
conn_type = (conn.conn_type or "").replace("_", "-")
|
|
424
|
-
|
|
425
|
-
# Handle file-based emitter (used in tests)
|
|
426
|
-
if conn_type == "datahub-file":
|
|
427
|
-
import datahub.emitter.synchronized_file_emitter
|
|
428
|
-
|
|
429
|
-
filename = conn.host
|
|
430
|
-
if not filename:
|
|
431
|
-
logger.warning(
|
|
432
|
-
f"Connection '{conn_id}' is type datahub-file but has no host (filename) configured"
|
|
433
|
-
)
|
|
434
|
-
return None
|
|
435
|
-
|
|
436
|
-
logger.debug(
|
|
437
|
-
f"Retrieved connection '{conn_id}' from secrets: type=datahub-file, filename={filename}"
|
|
438
|
-
)
|
|
439
|
-
return (
|
|
440
|
-
datahub.emitter.synchronized_file_emitter.SynchronizedFileEmitter(
|
|
441
|
-
filename=filename
|
|
442
|
-
)
|
|
443
|
-
)
|
|
444
|
-
|
|
445
|
-
# Handle Kafka-based emitter
|
|
446
|
-
elif conn_type == "datahub-kafka":
|
|
447
|
-
import datahub.emitter.kafka_emitter
|
|
448
|
-
import datahub.ingestion.sink.datahub_kafka
|
|
449
|
-
|
|
450
|
-
obj = conn.extra_dejson or {}
|
|
451
|
-
obj.setdefault("connection", {})
|
|
452
|
-
if conn.host:
|
|
453
|
-
bootstrap = ":".join(map(str, filter(None, [conn.host, conn.port])))
|
|
454
|
-
obj["connection"]["bootstrap"] = bootstrap
|
|
455
|
-
|
|
456
|
-
config = datahub.ingestion.sink.datahub_kafka.KafkaSinkConfig.parse_obj(
|
|
457
|
-
obj
|
|
458
|
-
)
|
|
459
|
-
logger.debug(
|
|
460
|
-
f"Retrieved connection '{conn_id}' from connection API: type=datahub-kafka"
|
|
461
|
-
)
|
|
462
|
-
return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(config)
|
|
463
|
-
|
|
464
|
-
# Handle REST-based emitter (default)
|
|
465
|
-
else:
|
|
466
|
-
import datahub.emitter.rest_emitter
|
|
467
|
-
from datahub.ingestion.graph.config import ClientMode
|
|
468
|
-
|
|
469
|
-
# Build host URL with port if needed
|
|
470
|
-
host = conn.host or ""
|
|
471
|
-
if not host:
|
|
472
|
-
logger.warning(f"Connection '{conn_id}' has no host configured")
|
|
473
|
-
return None
|
|
474
|
-
|
|
475
|
-
# Parse the URL using stdlib urlparse
|
|
476
|
-
parsed = urlparse(host if "://" in host else f"http://{host}")
|
|
477
|
-
|
|
478
|
-
# Add port if specified and not already in URL
|
|
479
|
-
netloc = parsed.netloc
|
|
480
|
-
if conn.port and not parsed.port:
|
|
481
|
-
netloc = f"{parsed.hostname}:{conn.port}"
|
|
482
|
-
|
|
483
|
-
# Reconstruct the URL
|
|
484
|
-
host = urlunparse(
|
|
485
|
-
(
|
|
486
|
-
parsed.scheme or "http",
|
|
487
|
-
netloc,
|
|
488
|
-
parsed.path,
|
|
489
|
-
parsed.params,
|
|
490
|
-
parsed.query,
|
|
491
|
-
parsed.fragment,
|
|
492
|
-
)
|
|
493
|
-
)
|
|
494
|
-
|
|
495
|
-
# Get token - check airflow.cfg first, then connection password
|
|
496
|
-
token = conf.get("datahub", "token", fallback=None)
|
|
497
|
-
if token is None:
|
|
498
|
-
token = conn.password
|
|
499
|
-
|
|
500
|
-
# Get extra args
|
|
501
|
-
extra_args = conn.extra_dejson or {}
|
|
502
|
-
|
|
503
|
-
logger.debug(
|
|
504
|
-
f"Retrieved connection '{conn_id}' from connection API: type={conn_type or 'datahub-rest'}, host={host}, has_token={bool(token)}"
|
|
505
|
-
)
|
|
506
|
-
|
|
507
|
-
return datahub.emitter.rest_emitter.DataHubRestEmitter(
|
|
508
|
-
host,
|
|
509
|
-
token,
|
|
510
|
-
client_mode=ClientMode.INGESTION,
|
|
511
|
-
datahub_component="airflow-plugin",
|
|
512
|
-
**extra_args,
|
|
513
|
-
)
|
|
514
|
-
except Exception as e:
|
|
515
|
-
logger.debug(
|
|
516
|
-
f"Failed to create emitter from connection: {e}", exc_info=True
|
|
517
|
-
)
|
|
518
|
-
return None
|
|
519
|
-
|
|
520
|
-
@property
|
|
521
|
-
def emitter(self):
|
|
522
|
-
"""Compatibility property that delegates to _get_emitter()."""
|
|
523
|
-
result = self._get_emitter()
|
|
524
|
-
if result is None:
|
|
525
|
-
# Retry emitter creation
|
|
526
|
-
self._emitter = None # Reset to force retry
|
|
527
|
-
return self._get_emitter()
|
|
528
|
-
return result
|
|
529
|
-
|
|
530
|
-
@property
|
|
531
|
-
def graph(self) -> Optional[DataHubGraph]:
|
|
532
|
-
if self._graph:
|
|
533
|
-
return self._graph
|
|
534
|
-
|
|
535
|
-
# Use _get_emitter() method to ensure lazy-loading happens first
|
|
536
|
-
emitter = self._get_emitter()
|
|
537
|
-
if emitter is not None:
|
|
538
|
-
import datahub.emitter.rest_emitter
|
|
539
|
-
|
|
540
|
-
if isinstance(
|
|
541
|
-
emitter, datahub.emitter.rest_emitter.DataHubRestEmitter
|
|
542
|
-
) and not isinstance(emitter, DataHubGraph):
|
|
543
|
-
# This is lazy initialized to avoid throwing errors on plugin load.
|
|
544
|
-
self._graph = emitter.to_graph()
|
|
545
|
-
self._emitter = self._graph
|
|
546
|
-
|
|
547
|
-
return self._graph
|
|
548
|
-
|
|
549
|
-
def _set_log_level(self) -> None:
|
|
550
|
-
"""Set the log level for the plugin and its dependencies.
|
|
551
|
-
|
|
552
|
-
This may need to be called multiple times, since Airflow sometimes
|
|
553
|
-
messes with the logging configuration after the plugin is loaded.
|
|
554
|
-
In particular, the loggers may get changed when the worker starts
|
|
555
|
-
executing a task.
|
|
556
|
-
"""
|
|
557
|
-
|
|
558
|
-
if self.config.log_level:
|
|
559
|
-
logging.getLogger(__name__.split(".")[0]).setLevel(self.config.log_level)
|
|
560
|
-
if self.config.debug_emitter:
|
|
561
|
-
logging.getLogger("datahub.emitter").setLevel(logging.DEBUG)
|
|
562
|
-
|
|
563
|
-
def _make_emit_callback(self) -> Callable[[Optional[Exception], str], None]:
|
|
564
|
-
def emit_callback(err: Optional[Exception], msg: str) -> None:
|
|
565
|
-
if err:
|
|
566
|
-
logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err)
|
|
567
|
-
|
|
568
|
-
return emit_callback
|
|
569
|
-
|
|
570
|
-
def _extract_lineage_from_airflow3(
|
|
571
|
-
self,
|
|
572
|
-
task: "Operator",
|
|
573
|
-
task_instance: "TaskInstance",
|
|
574
|
-
complete: bool,
|
|
575
|
-
) -> Tuple[List[str], List[str], Optional[SqlParsingResult]]:
|
|
576
|
-
"""Extract lineage using Airflow 3.x OpenLineage integration."""
|
|
577
|
-
input_urns: List[str] = []
|
|
578
|
-
output_urns: List[str] = []
|
|
579
|
-
sql_parsing_result: Optional[SqlParsingResult] = None
|
|
580
|
-
|
|
581
|
-
logger.debug(
|
|
582
|
-
f"Extracting lineage for task {task.task_id} (complete={complete})"
|
|
583
|
-
)
|
|
584
|
-
logger.debug("Airflow 3.0+: Attempting to get lineage from OpenLineage")
|
|
585
|
-
try:
|
|
586
|
-
from datahub_airflow_plugin._datahub_ol_adapter import (
|
|
587
|
-
translate_ol_to_datahub_urn,
|
|
588
|
-
)
|
|
589
|
-
|
|
590
|
-
# Check if the operator has OpenLineage support
|
|
591
|
-
facet_method_name = (
|
|
592
|
-
"get_openlineage_facets_on_complete"
|
|
593
|
-
if complete
|
|
594
|
-
else "get_openlineage_facets_on_start"
|
|
595
|
-
)
|
|
596
|
-
has_on_complete = hasattr(task, "get_openlineage_facets_on_complete")
|
|
597
|
-
has_on_start = hasattr(task, "get_openlineage_facets_on_start")
|
|
598
|
-
logger.debug(
|
|
599
|
-
f"Task {task.task_id} OpenLineage support: on_complete={has_on_complete}, on_start={has_on_start}, operator_type={type(task).__name__}, required_method={facet_method_name}"
|
|
600
|
-
)
|
|
601
|
-
|
|
602
|
-
if not hasattr(task, facet_method_name):
|
|
603
|
-
logger.debug(
|
|
604
|
-
f"Task {task.task_id} does not have OpenLineage support (missing {facet_method_name}) - SQL parsing will not be triggered"
|
|
605
|
-
)
|
|
606
|
-
return input_urns, output_urns, sql_parsing_result
|
|
607
|
-
|
|
608
|
-
facet_method = getattr(task, facet_method_name)
|
|
609
|
-
|
|
610
|
-
try:
|
|
611
|
-
# Call the appropriate facet method
|
|
612
|
-
operator_lineage = (
|
|
613
|
-
facet_method(task_instance) if complete else facet_method()
|
|
614
|
-
)
|
|
615
|
-
|
|
616
|
-
if not operator_lineage:
|
|
617
|
-
logger.debug(
|
|
618
|
-
f"OpenLineage facet method {facet_method_name} returned None for task {task.task_id} - this is expected for BigQuery when no job_id is found"
|
|
619
|
-
)
|
|
620
|
-
# Even if operator_lineage is None, we might have SQL parsing result from a patch
|
|
621
|
-
# that created a new OperatorLineage. But if it's None, there's nothing to process.
|
|
622
|
-
return input_urns, output_urns, sql_parsing_result
|
|
623
|
-
|
|
624
|
-
logger.debug(
|
|
625
|
-
f"Got OpenLineage operator lineage for task {task.task_id}: inputs={len(operator_lineage.inputs)}, outputs={len(operator_lineage.outputs)}, run_facets_keys={list(operator_lineage.run_facets.keys()) if hasattr(operator_lineage, 'run_facets') else 'N/A'}"
|
|
626
|
-
)
|
|
627
|
-
|
|
628
|
-
# Translate OpenLineage datasets to DataHub URNs
|
|
629
|
-
for ol_dataset in operator_lineage.inputs:
|
|
630
|
-
urn = translate_ol_to_datahub_urn(ol_dataset)
|
|
631
|
-
input_urns.append(urn)
|
|
632
|
-
logger.debug(
|
|
633
|
-
f" Input: {ol_dataset.namespace}/{ol_dataset.name} -> {urn}"
|
|
634
|
-
)
|
|
635
|
-
|
|
636
|
-
for ol_dataset in operator_lineage.outputs:
|
|
637
|
-
urn = translate_ol_to_datahub_urn(ol_dataset)
|
|
638
|
-
output_urns.append(urn)
|
|
639
|
-
logger.debug(
|
|
640
|
-
f" Output: {ol_dataset.namespace}/{ol_dataset.name} -> {urn}"
|
|
641
|
-
)
|
|
642
|
-
|
|
643
|
-
# Check if DataHub SQL parsing result is in run_facets (from our patch)
|
|
644
|
-
logger.debug(
|
|
645
|
-
f"Checking for SQL parsing result in OpenLineage run facets for task {task.task_id}. Key: {DATAHUB_SQL_PARSING_RESULT_KEY}"
|
|
646
|
-
)
|
|
647
|
-
if (
|
|
648
|
-
hasattr(operator_lineage, "run_facets")
|
|
649
|
-
and operator_lineage.run_facets
|
|
650
|
-
):
|
|
651
|
-
logger.debug(
|
|
652
|
-
f"Run facets available: {list(operator_lineage.run_facets.keys())}"
|
|
653
|
-
)
|
|
654
|
-
if DATAHUB_SQL_PARSING_RESULT_KEY in operator_lineage.run_facets:
|
|
655
|
-
sql_parsing_result = operator_lineage.run_facets[
|
|
656
|
-
DATAHUB_SQL_PARSING_RESULT_KEY
|
|
657
|
-
] # type: ignore
|
|
658
|
-
if sql_parsing_result is not None:
|
|
659
|
-
logger.debug(
|
|
660
|
-
f"✓ Found DataHub SQL parsing result for task {task.task_id} with {len(sql_parsing_result.column_lineage or [])} column lineages"
|
|
661
|
-
)
|
|
662
|
-
else:
|
|
663
|
-
logger.debug(
|
|
664
|
-
f"SQL parsing result key exists but value is None for task {task.task_id}"
|
|
665
|
-
)
|
|
666
|
-
else:
|
|
667
|
-
logger.debug(
|
|
668
|
-
f"SQL parsing result key '{DATAHUB_SQL_PARSING_RESULT_KEY}' not found in run_facets for task {task.task_id}"
|
|
669
|
-
)
|
|
670
|
-
else:
|
|
671
|
-
logger.debug(
|
|
672
|
-
f"No run_facets available in operator_lineage for task {task.task_id}"
|
|
673
|
-
)
|
|
674
|
-
|
|
675
|
-
except Exception as e:
|
|
676
|
-
logger.debug(
|
|
677
|
-
f"Error calling OpenLineage facet method: {e}", exc_info=True
|
|
678
|
-
)
|
|
679
|
-
|
|
680
|
-
except Exception as e:
|
|
681
|
-
logger.warning(
|
|
682
|
-
f"Error extracting lineage from OpenLineage: {e}", exc_info=True
|
|
683
|
-
)
|
|
684
|
-
|
|
685
|
-
return input_urns, output_urns, sql_parsing_result
|
|
686
|
-
|
|
687
|
-
def _process_sql_parsing_result(
|
|
688
|
-
self,
|
|
689
|
-
datajob: DataJob,
|
|
690
|
-
sql_parsing_result: Optional[SqlParsingResult],
|
|
691
|
-
) -> Tuple[List[str], List[str], List[FineGrainedLineageClass]]:
|
|
692
|
-
"""Process SQL parsing result and return additional URNs and column lineage."""
|
|
693
|
-
input_urns: List[str] = []
|
|
694
|
-
output_urns: List[str] = []
|
|
695
|
-
fine_grained_lineages: List[FineGrainedLineageClass] = []
|
|
696
|
-
|
|
697
|
-
if not sql_parsing_result:
|
|
698
|
-
logger.debug(
|
|
699
|
-
f"No SQL parsing result available for task {datajob.urn} - lineage may be incomplete"
|
|
700
|
-
)
|
|
701
|
-
return input_urns, output_urns, fine_grained_lineages
|
|
702
|
-
|
|
703
|
-
# Log parsing result summary for debugging
|
|
704
|
-
logger.debug(
|
|
705
|
-
f"Processing SQL parsing result for task {datajob.urn}: "
|
|
706
|
-
f"in_tables={len(sql_parsing_result.in_tables)}, "
|
|
707
|
-
f"out_tables={len(sql_parsing_result.out_tables)}, "
|
|
708
|
-
f"column_lineage={len(sql_parsing_result.column_lineage or [])}, "
|
|
709
|
-
f"table_error={sql_parsing_result.debug_info.table_error}, "
|
|
710
|
-
f"error={sql_parsing_result.debug_info.error}"
|
|
711
|
-
)
|
|
712
|
-
|
|
713
|
-
if error := sql_parsing_result.debug_info.error:
|
|
714
|
-
logger.warning(
|
|
715
|
-
f"SQL parsing error for task {datajob.urn}: {error}", exc_info=error
|
|
716
|
-
)
|
|
717
|
-
datajob.properties["datahub_sql_parser_error"] = (
|
|
718
|
-
f"{type(error).__name__}: {error}"
|
|
719
|
-
)
|
|
720
|
-
|
|
721
|
-
if not sql_parsing_result.debug_info.table_error:
|
|
722
|
-
input_urns.extend(sql_parsing_result.in_tables)
|
|
723
|
-
output_urns.extend(sql_parsing_result.out_tables)
|
|
724
|
-
|
|
725
|
-
if sql_parsing_result.column_lineage:
|
|
726
|
-
# Create FGLs from column_lineage items
|
|
727
|
-
# Duplicates will be caught by sql_fine_grained_lineages deduplication below
|
|
728
|
-
fine_grained_lineages.extend(
|
|
729
|
-
FineGrainedLineageClass(
|
|
730
|
-
upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
|
|
731
|
-
downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
|
|
732
|
-
upstreams=[
|
|
733
|
-
builder.make_schema_field_urn(
|
|
734
|
-
upstream.table, upstream.column
|
|
735
|
-
)
|
|
736
|
-
for upstream in column_lineage.upstreams
|
|
737
|
-
],
|
|
738
|
-
downstreams=[
|
|
739
|
-
builder.make_schema_field_urn(
|
|
740
|
-
downstream.table, downstream.column
|
|
741
|
-
)
|
|
742
|
-
for downstream in [column_lineage.downstream]
|
|
743
|
-
if downstream.table
|
|
744
|
-
],
|
|
745
|
-
)
|
|
746
|
-
for column_lineage in sql_parsing_result.column_lineage
|
|
747
|
-
)
|
|
748
|
-
logger.debug(
|
|
749
|
-
f"Created {len(fine_grained_lineages)} FGLs from {len(sql_parsing_result.column_lineage)} column_lineage items for task {datajob.urn}"
|
|
750
|
-
)
|
|
751
|
-
else:
|
|
752
|
-
logger.warning(
|
|
753
|
-
f"SQL parsing table error for task {datajob.urn}: {sql_parsing_result.debug_info.table_error}"
|
|
754
|
-
)
|
|
755
|
-
|
|
756
|
-
return input_urns, output_urns, fine_grained_lineages
|
|
757
|
-
|
|
758
|
-
def _extract_lineage(
|
|
759
|
-
self,
|
|
760
|
-
datajob: DataJob,
|
|
761
|
-
dagrun: "DagRun",
|
|
762
|
-
task: "Operator",
|
|
763
|
-
task_instance: "TaskInstance",
|
|
764
|
-
complete: bool = False,
|
|
765
|
-
) -> None:
|
|
766
|
-
"""
|
|
767
|
-
Combine lineage (including column lineage) from task inlets/outlets and
|
|
768
|
-
extractor-generated task_metadata and write it to the datajob. This
|
|
769
|
-
routine is also responsible for converting the lineage to DataHub URNs.
|
|
770
|
-
"""
|
|
771
|
-
logger.debug(
|
|
772
|
-
f"_extract_lineage called for task {task.task_id} (complete={complete}, enable_datajob_lineage={self.config.enable_datajob_lineage})"
|
|
773
|
-
)
|
|
774
|
-
if not self.config.enable_datajob_lineage:
|
|
775
|
-
logger.debug(
|
|
776
|
-
f"Skipping lineage extraction for task {task.task_id} - enable_datajob_lineage is False"
|
|
777
|
-
)
|
|
778
|
-
return
|
|
779
|
-
|
|
780
|
-
input_urns: List[str] = []
|
|
781
|
-
output_urns: List[str] = []
|
|
782
|
-
fine_grained_lineages: List[FineGrainedLineageClass] = []
|
|
783
|
-
|
|
784
|
-
# For completion events, start with empty FGLs to avoid accumulating duplicates
|
|
785
|
-
if complete and datajob.fine_grained_lineages:
|
|
786
|
-
datajob.fine_grained_lineages = []
|
|
787
|
-
|
|
788
|
-
task_metadata = None
|
|
789
|
-
sql_parsing_result: Optional[SqlParsingResult] = None
|
|
790
|
-
|
|
791
|
-
# Extract lineage using Airflow 3.x OpenLineage integration
|
|
792
|
-
logger.debug(f"Calling _extract_lineage_from_airflow3 for task {task.task_id}")
|
|
793
|
-
extracted_input_urns, extracted_output_urns, sql_parsing_result = (
|
|
794
|
-
self._extract_lineage_from_airflow3(task, task_instance, complete)
|
|
795
|
-
)
|
|
796
|
-
logger.debug(
|
|
797
|
-
f"Lineage extraction result for task {task.task_id}: inputs={len(extracted_input_urns)}, outputs={len(extracted_output_urns)}, sql_parsing_result={'present' if sql_parsing_result else 'None'}"
|
|
798
|
-
)
|
|
799
|
-
input_urns.extend(extracted_input_urns)
|
|
800
|
-
output_urns.extend(extracted_output_urns)
|
|
801
|
-
|
|
802
|
-
# Process SQL parsing result
|
|
803
|
-
sql_input_urns, sql_output_urns, sql_fine_grained_lineages = (
|
|
804
|
-
self._process_sql_parsing_result(datajob, sql_parsing_result)
|
|
805
|
-
)
|
|
806
|
-
input_urns.extend(sql_input_urns)
|
|
807
|
-
output_urns.extend(sql_output_urns)
|
|
808
|
-
|
|
809
|
-
# Deduplicate within sql_fine_grained_lineages before adding to fine_grained_lineages
|
|
810
|
-
# This prevents duplicates from SQL parsing result itself
|
|
811
|
-
if sql_fine_grained_lineages:
|
|
812
|
-
seen_sql_fgl_keys = {}
|
|
813
|
-
unique_sql_fgls = []
|
|
814
|
-
for fgl in sql_fine_grained_lineages:
|
|
815
|
-
fgl_key = (
|
|
816
|
-
tuple(sorted(fgl.upstreams)) if fgl.upstreams else (),
|
|
817
|
-
tuple(sorted(fgl.downstreams)) if fgl.downstreams else (),
|
|
818
|
-
fgl.upstreamType,
|
|
819
|
-
fgl.downstreamType,
|
|
820
|
-
)
|
|
821
|
-
if fgl_key not in seen_sql_fgl_keys:
|
|
822
|
-
seen_sql_fgl_keys[fgl_key] = fgl
|
|
823
|
-
unique_sql_fgls.append(fgl)
|
|
824
|
-
|
|
825
|
-
if len(unique_sql_fgls) != len(sql_fine_grained_lineages):
|
|
826
|
-
logger.debug(
|
|
827
|
-
f"Deduplicated SQL parsing FGLs: {len(sql_fine_grained_lineages)} -> {len(unique_sql_fgls)} for task {datajob.urn}"
|
|
828
|
-
)
|
|
829
|
-
sql_fine_grained_lineages = unique_sql_fgls
|
|
830
|
-
|
|
831
|
-
fine_grained_lineages.extend(sql_fine_grained_lineages)
|
|
832
|
-
|
|
833
|
-
# Add DataHub-native inlets/outlets
|
|
834
|
-
input_urns.extend(
|
|
835
|
-
iolet.urn for iolet in get_task_inlets(task) if isinstance(iolet, _Entity)
|
|
836
|
-
)
|
|
837
|
-
output_urns.extend(
|
|
838
|
-
iolet.urn for iolet in get_task_outlets(task) if isinstance(iolet, _Entity)
|
|
839
|
-
)
|
|
840
|
-
|
|
841
|
-
# Write the lineage to the datajob object
|
|
842
|
-
datajob.inlets.extend(entities_to_dataset_urn_list(input_urns))
|
|
843
|
-
datajob.outlets.extend(entities_to_dataset_urn_list(output_urns))
|
|
844
|
-
datajob.upstream_urns.extend(entities_to_datajob_urn_list(input_urns))
|
|
845
|
-
|
|
846
|
-
# Set fine_grained_lineages - already deduplicated (sql_fine_grained_lineages)
|
|
847
|
-
datajob.fine_grained_lineages = fine_grained_lineages
|
|
848
|
-
|
|
849
|
-
# Merge with datajob from task start (if this is task completion)
|
|
850
|
-
if complete:
|
|
851
|
-
original_datajob = self._datajob_holder.get(str(datajob.urn), None)
|
|
852
|
-
else:
|
|
853
|
-
self._datajob_holder[str(datajob.urn)] = datajob
|
|
854
|
-
original_datajob = None
|
|
855
|
-
|
|
856
|
-
if original_datajob:
|
|
857
|
-
logger.debug("Merging start datajob into finish datajob")
|
|
858
|
-
datajob.inlets.extend(original_datajob.inlets)
|
|
859
|
-
datajob.outlets.extend(original_datajob.outlets)
|
|
860
|
-
datajob.upstream_urns.extend(original_datajob.upstream_urns)
|
|
861
|
-
# Don't merge fine_grained_lineages from start - completion lineage is complete and accurate
|
|
862
|
-
# This avoids duplicates when SQLParser extracts lineage on both start and completion
|
|
863
|
-
|
|
864
|
-
for k, v in original_datajob.properties.items():
|
|
865
|
-
datajob.properties.setdefault(k, v)
|
|
866
|
-
|
|
867
|
-
# Deduplicate inlets/outlets
|
|
868
|
-
datajob.inlets = list(sorted(set(datajob.inlets), key=lambda x: str(x)))
|
|
869
|
-
datajob.outlets = list(sorted(set(datajob.outlets), key=lambda x: str(x)))
|
|
870
|
-
datajob.upstream_urns = list(
|
|
871
|
-
sorted(set(datajob.upstream_urns), key=lambda x: str(x))
|
|
872
|
-
)
|
|
873
|
-
|
|
874
|
-
# Write all other OL facets as DataHub properties
|
|
875
|
-
if task_metadata:
|
|
876
|
-
for k, v in task_metadata.job_facets.items():
|
|
877
|
-
datajob.properties[f"openlineage_job_facet_{k}"] = Serde.to_json(
|
|
878
|
-
redact_with_exclusions(v) # type: ignore[arg-type]
|
|
879
|
-
)
|
|
880
|
-
|
|
881
|
-
for k, v in task_metadata.run_facets.items():
|
|
882
|
-
datajob.properties[f"openlineage_run_facet_{k}"] = Serde.to_json(
|
|
883
|
-
redact_with_exclusions(v) # type: ignore[arg-type]
|
|
884
|
-
)
|
|
885
|
-
|
|
886
|
-
def check_kill_switch(self) -> bool:
|
|
887
|
-
"""
|
|
888
|
-
Check kill switch for Airflow 3.0+.
|
|
889
|
-
|
|
890
|
-
Variable.get() cannot be called from listener hooks in Airflow 3.0+
|
|
891
|
-
because it creates a database session commit which breaks HA locks.
|
|
892
|
-
Use environment variable instead.
|
|
893
|
-
"""
|
|
894
|
-
if (
|
|
895
|
-
os.getenv(
|
|
896
|
-
f"AIRFLOW_VAR_{KILL_SWITCH_VARIABLE_NAME}".upper(), "false"
|
|
897
|
-
).lower()
|
|
898
|
-
== "true"
|
|
899
|
-
):
|
|
900
|
-
logger.debug("DataHub listener disabled by kill switch (env var)")
|
|
901
|
-
return True
|
|
902
|
-
return False
|
|
903
|
-
|
|
904
|
-
def _prepare_task_context(
|
|
905
|
-
self, task_instance: "TaskInstance", for_completion: bool = False
|
|
906
|
-
) -> Optional[Tuple["DagRun", "Operator", "DAG"]]:
|
|
907
|
-
"""
|
|
908
|
-
Prepare task context by extracting DAG run, task, and DAG from task instance.
|
|
909
|
-
|
|
910
|
-
Args:
|
|
911
|
-
task_instance: The Airflow task instance
|
|
912
|
-
for_completion: If True, retrieves task from holder for completion events
|
|
913
|
-
|
|
914
|
-
Returns:
|
|
915
|
-
Tuple of (dagrun, task, dag) or None if context cannot be prepared
|
|
916
|
-
"""
|
|
917
|
-
# Get dagrun in a version-compatible way (Airflow 2.x vs 3.x)
|
|
918
|
-
dagrun: "DagRun" = _get_dagrun_from_task_instance(task_instance)
|
|
919
|
-
|
|
920
|
-
if self.config.render_templates:
|
|
921
|
-
task_instance = _render_templates(task_instance)
|
|
922
|
-
|
|
923
|
-
# Get task - should be directly available on task_instance
|
|
924
|
-
task = task_instance.task if hasattr(task_instance, "task") else None
|
|
925
|
-
|
|
926
|
-
if task is None:
|
|
927
|
-
return None
|
|
928
|
-
|
|
929
|
-
dag: "DAG" = task.dag # type: ignore[assignment]
|
|
930
|
-
|
|
931
|
-
# Check if DAG is allowed by filter pattern
|
|
932
|
-
if not self.config.dag_filter_pattern.allowed(dag.dag_id):
|
|
933
|
-
logger.debug(f"DAG {dag.dag_id} is not allowed by the pattern")
|
|
934
|
-
return None
|
|
935
|
-
|
|
936
|
-
# Task type can vary between Airflow versions (MappedOperator, SerializedBaseOperator, etc.)
|
|
937
|
-
return dagrun, task, dag # type: ignore[return-value]
|
|
938
|
-
|
|
939
|
-
def _generate_and_emit_datajob(
|
|
940
|
-
self,
|
|
941
|
-
dagrun: "DagRun",
|
|
942
|
-
task: "Operator",
|
|
943
|
-
dag: "DAG",
|
|
944
|
-
task_instance: "TaskInstance",
|
|
945
|
-
complete: bool = False,
|
|
946
|
-
) -> DataJob:
|
|
947
|
-
"""
|
|
948
|
-
Generate DataJob with lineage and emit it to DataHub.
|
|
949
|
-
|
|
950
|
-
Args:
|
|
951
|
-
dagrun: The DAG run
|
|
952
|
-
task: The task operator
|
|
953
|
-
dag: The DAG
|
|
954
|
-
task_instance: The task instance
|
|
955
|
-
complete: Whether this is for task completion
|
|
956
|
-
|
|
957
|
-
Returns:
|
|
958
|
-
The generated DataJob
|
|
959
|
-
"""
|
|
960
|
-
# Check if emitter is available
|
|
961
|
-
emitter = self._get_emitter()
|
|
962
|
-
if emitter is None:
|
|
963
|
-
logger.warning(
|
|
964
|
-
f"DataHub emitter not available for task {task.task_id}, skipping metadata emission"
|
|
965
|
-
)
|
|
966
|
-
# Still generate the datajob for tracking purposes, but don't emit
|
|
967
|
-
datajob = AirflowGenerator.generate_datajob(
|
|
968
|
-
cluster=self.config.cluster,
|
|
969
|
-
task=task, # type: ignore[arg-type]
|
|
970
|
-
dag=dag,
|
|
971
|
-
capture_tags=self.config.capture_tags_info,
|
|
972
|
-
capture_owner=self.config.capture_ownership_info,
|
|
973
|
-
config=self.config,
|
|
974
|
-
)
|
|
975
|
-
self._extract_lineage(
|
|
976
|
-
datajob, dagrun, task, task_instance, complete=complete
|
|
977
|
-
) # type: ignore[arg-type]
|
|
978
|
-
return datajob
|
|
979
|
-
|
|
980
|
-
datajob = AirflowGenerator.generate_datajob(
|
|
981
|
-
cluster=self.config.cluster,
|
|
982
|
-
task=task, # type: ignore[arg-type]
|
|
983
|
-
dag=dag,
|
|
984
|
-
capture_tags=self.config.capture_tags_info,
|
|
985
|
-
capture_owner=self.config.capture_ownership_info,
|
|
986
|
-
config=self.config,
|
|
987
|
-
)
|
|
988
|
-
|
|
989
|
-
# Add lineage info
|
|
990
|
-
self._extract_lineage(datajob, dagrun, task, task_instance, complete=complete) # type: ignore[arg-type]
|
|
991
|
-
|
|
992
|
-
# Emit DataJob MCPs
|
|
993
|
-
# Skip dataJobInputOutput aspects on task start to avoid file emitter merging duplicates
|
|
994
|
-
# The file emitter merges aspects with the same entity URN and aspect name,
|
|
995
|
-
# which causes FGLs from start and completion to be combined into duplicates.
|
|
996
|
-
# We only emit the aspect on completion when lineage is complete and accurate.
|
|
997
|
-
for mcp in datajob.generate_mcp(
|
|
998
|
-
generate_lineage=self.config.enable_datajob_lineage,
|
|
999
|
-
materialize_iolets=self.config.materialize_iolets,
|
|
1000
|
-
):
|
|
1001
|
-
# Skip dataJobInputOutput aspects on task start
|
|
1002
|
-
if not complete:
|
|
1003
|
-
if isinstance(mcp.aspect, DataJobInputOutputClass):
|
|
1004
|
-
logger.debug(
|
|
1005
|
-
f"Skipping dataJobInputOutput for task {task.task_id} on start "
|
|
1006
|
-
f"(will be emitted on completion to avoid file emitter merging duplicates)"
|
|
1007
|
-
)
|
|
1008
|
-
continue
|
|
1009
|
-
|
|
1010
|
-
emitter.emit(mcp, self._make_emit_callback())
|
|
1011
|
-
|
|
1012
|
-
status_text = f"finish w/ status {complete}" if complete else "start"
|
|
1013
|
-
logger.debug(f"Emitted DataHub Datajob {status_text}: {datajob}")
|
|
1014
|
-
|
|
1015
|
-
return datajob
|
|
1016
|
-
|
|
1017
|
-
@hookimpl
|
|
1018
|
-
@run_in_thread
|
|
1019
|
-
def on_task_instance_running( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
|
|
1020
|
-
self, previous_state, task_instance: "TaskInstance", **kwargs
|
|
1021
|
-
) -> None:
|
|
1022
|
-
# In Airflow 3.0, the session parameter was removed from the hook signature
|
|
1023
|
-
if self.check_kill_switch():
|
|
1024
|
-
return
|
|
1025
|
-
self._set_log_level()
|
|
1026
|
-
|
|
1027
|
-
# This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508.
|
|
1028
|
-
if not hasattr(task_instance, "task"):
|
|
1029
|
-
logger.warning(
|
|
1030
|
-
f"No task set for task_id: {task_instance.task_id} - " # type: ignore[attr-defined]
|
|
1031
|
-
f"dag_id: {task_instance.dag_id} - run_id {task_instance.run_id}" # type: ignore[attr-defined]
|
|
1032
|
-
)
|
|
1033
|
-
return
|
|
1034
|
-
|
|
1035
|
-
logger.debug(
|
|
1036
|
-
f"DataHub listener got notification about task instance start for {task_instance.task_id} of dag {task_instance.dag_id}"
|
|
1037
|
-
)
|
|
1038
|
-
|
|
1039
|
-
# Check if DAG is allowed before doing any expensive operations
|
|
1040
|
-
if not self.config.dag_filter_pattern.allowed(task_instance.dag_id):
|
|
1041
|
-
logger.debug(f"DAG {task_instance.dag_id} is not allowed by the pattern")
|
|
1042
|
-
return
|
|
1043
|
-
|
|
1044
|
-
# Handle async operators by skipping deferred state
|
|
1045
|
-
if (
|
|
1046
|
-
hasattr(task_instance, "next_method")
|
|
1047
|
-
and task_instance.next_method is not None
|
|
1048
|
-
):
|
|
1049
|
-
return
|
|
1050
|
-
|
|
1051
|
-
# Render templates and extract context
|
|
1052
|
-
if self.config.render_templates:
|
|
1053
|
-
task_instance = _render_templates(task_instance)
|
|
1054
|
-
|
|
1055
|
-
dagrun: "DagRun" = _get_dagrun_from_task_instance(task_instance)
|
|
1056
|
-
task = task_instance.task
|
|
1057
|
-
assert task is not None
|
|
1058
|
-
dag: "DAG" = task.dag # type: ignore[assignment]
|
|
1059
|
-
|
|
1060
|
-
# Airflow 3.0+ doesn't need task holder
|
|
1061
|
-
|
|
1062
|
-
# If we don't have the DAG listener API, emit DAG start event
|
|
1063
|
-
if not HAS_AIRFLOW_DAG_LISTENER_API:
|
|
1064
|
-
self.on_dag_start(dagrun)
|
|
1065
|
-
|
|
1066
|
-
# Generate and emit datajob
|
|
1067
|
-
# Task type can vary between Airflow versions (MappedOperator from different modules)
|
|
1068
|
-
datajob = self._generate_and_emit_datajob(
|
|
1069
|
-
dagrun,
|
|
1070
|
-
task, # type: ignore[arg-type]
|
|
1071
|
-
dag,
|
|
1072
|
-
task_instance,
|
|
1073
|
-
complete=False,
|
|
1074
|
-
)
|
|
1075
|
-
|
|
1076
|
-
# Emit process instance if capturing executions
|
|
1077
|
-
emitter = self._get_emitter()
|
|
1078
|
-
if self.config.capture_executions and emitter:
|
|
1079
|
-
dpi = AirflowGenerator.run_datajob(
|
|
1080
|
-
emitter=emitter,
|
|
1081
|
-
config=self.config,
|
|
1082
|
-
ti=task_instance,
|
|
1083
|
-
dag=dag,
|
|
1084
|
-
dag_run=dagrun,
|
|
1085
|
-
datajob=datajob,
|
|
1086
|
-
emit_templates=False,
|
|
1087
|
-
)
|
|
1088
|
-
logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}")
|
|
1089
|
-
|
|
1090
|
-
if emitter:
|
|
1091
|
-
emitter.flush()
|
|
1092
|
-
|
|
1093
|
-
logger.debug(
|
|
1094
|
-
f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}"
|
|
1095
|
-
)
|
|
1096
|
-
|
|
1097
|
-
self.materialize_iolets(datajob)
|
|
1098
|
-
|
|
1099
|
-
def materialize_iolets(self, datajob: DataJob) -> None:
|
|
1100
|
-
if self.config.materialize_iolets:
|
|
1101
|
-
emitter = self._get_emitter()
|
|
1102
|
-
if emitter is None:
|
|
1103
|
-
logger.warning(
|
|
1104
|
-
"DataHub emitter not available, skipping iolet materialization"
|
|
1105
|
-
)
|
|
1106
|
-
return
|
|
1107
|
-
|
|
1108
|
-
for outlet in datajob.outlets:
|
|
1109
|
-
reported_time: int = int(time.time() * 1000)
|
|
1110
|
-
operation = OperationClass(
|
|
1111
|
-
timestampMillis=reported_time,
|
|
1112
|
-
operationType=OperationTypeClass.CREATE,
|
|
1113
|
-
lastUpdatedTimestamp=reported_time,
|
|
1114
|
-
actor=builder.make_user_urn("airflow"),
|
|
1115
|
-
)
|
|
1116
|
-
|
|
1117
|
-
operation_mcp = MetadataChangeProposalWrapper(
|
|
1118
|
-
entityUrn=str(outlet), aspect=operation
|
|
1119
|
-
)
|
|
1120
|
-
|
|
1121
|
-
emitter.emit(operation_mcp)
|
|
1122
|
-
logger.debug(f"Emitted Dataset Operation: {outlet}")
|
|
1123
|
-
else:
|
|
1124
|
-
if self.graph:
|
|
1125
|
-
for outlet in datajob.outlets:
|
|
1126
|
-
if not self.graph.exists(str(outlet)):
|
|
1127
|
-
logger.warning(f"Dataset {str(outlet)} not materialized")
|
|
1128
|
-
for inlet in datajob.inlets:
|
|
1129
|
-
if not self.graph.exists(str(inlet)):
|
|
1130
|
-
logger.warning(f"Dataset {str(inlet)} not materialized")
|
|
1131
|
-
|
|
1132
|
-
def on_task_instance_finish(
|
|
1133
|
-
self, task_instance: "TaskInstance", status: InstanceRunResult
|
|
1134
|
-
) -> None:
|
|
1135
|
-
logger.debug(
|
|
1136
|
-
f"on_task_instance_finish called for task {task_instance.task_id} (dag_id={task_instance.dag_id}, status={status})"
|
|
1137
|
-
)
|
|
1138
|
-
# Prepare task context (handles template rendering, task retrieval, DAG filtering)
|
|
1139
|
-
context = self._prepare_task_context(task_instance, for_completion=True)
|
|
1140
|
-
if context is None:
|
|
1141
|
-
logger.debug(
|
|
1142
|
-
f"Task context preparation returned None for task {task_instance.task_id}"
|
|
1143
|
-
)
|
|
1144
|
-
return
|
|
1145
|
-
|
|
1146
|
-
dagrun, task, dag = context
|
|
1147
|
-
logger.debug(
|
|
1148
|
-
f"Task context prepared for task {task_instance.task_id}: task_type={type(task).__name__}"
|
|
1149
|
-
)
|
|
1150
|
-
|
|
1151
|
-
# Generate and emit datajob with lineage
|
|
1152
|
-
logger.debug(
|
|
1153
|
-
f"Generating and emitting DataJob for task {task_instance.task_id} (complete=True)"
|
|
1154
|
-
)
|
|
1155
|
-
datajob = self._generate_and_emit_datajob(
|
|
1156
|
-
dagrun, task, dag, task_instance, complete=True
|
|
1157
|
-
)
|
|
1158
|
-
|
|
1159
|
-
# Emit process instance if capturing executions
|
|
1160
|
-
emitter = self._get_emitter()
|
|
1161
|
-
if self.config.capture_executions and emitter:
|
|
1162
|
-
dpi = AirflowGenerator.complete_datajob(
|
|
1163
|
-
emitter=emitter,
|
|
1164
|
-
cluster=self.config.cluster,
|
|
1165
|
-
ti=task_instance,
|
|
1166
|
-
dag=dag,
|
|
1167
|
-
dag_run=dagrun,
|
|
1168
|
-
datajob=datajob,
|
|
1169
|
-
result=status,
|
|
1170
|
-
config=self.config,
|
|
1171
|
-
)
|
|
1172
|
-
logger.debug(
|
|
1173
|
-
f"Emitted DataHub DataProcess Instance with status {status}: {dpi}"
|
|
1174
|
-
)
|
|
1175
|
-
# Emit inlet/outlet aspects for DataProcessInstance (emit_process_end only emits run event)
|
|
1176
|
-
# This matches the behavior of emit_process_start which calls generate_mcp()
|
|
1177
|
-
for mcp in dpi.generate_inlet_outlet_mcp(materialize_iolets=False):
|
|
1178
|
-
emitter.emit(mcp, self._make_emit_callback())
|
|
1179
|
-
|
|
1180
|
-
if emitter:
|
|
1181
|
-
emitter.flush()
|
|
1182
|
-
|
|
1183
|
-
@hookimpl
|
|
1184
|
-
@run_in_thread
|
|
1185
|
-
def on_task_instance_success( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
|
|
1186
|
-
self, previous_state, task_instance: "TaskInstance", **kwargs
|
|
1187
|
-
) -> None:
|
|
1188
|
-
logger.debug(
|
|
1189
|
-
f"on_task_instance_success hook called for task {task_instance.task_id} (dag_id={task_instance.dag_id})"
|
|
1190
|
-
)
|
|
1191
|
-
if self.check_kill_switch():
|
|
1192
|
-
logger.debug(
|
|
1193
|
-
f"Skipping task {task_instance.task_id} - kill switch is enabled"
|
|
1194
|
-
)
|
|
1195
|
-
return
|
|
1196
|
-
|
|
1197
|
-
self._set_log_level()
|
|
1198
|
-
|
|
1199
|
-
logger.debug(
|
|
1200
|
-
f"DataHub listener got notification about task instance success for {task_instance.task_id}"
|
|
1201
|
-
)
|
|
1202
|
-
self.on_task_instance_finish(task_instance, status=InstanceRunResult.SUCCESS)
|
|
1203
|
-
logger.debug(
|
|
1204
|
-
f"DataHub listener finished processing task instance success for {task_instance.task_id}"
|
|
1205
|
-
)
|
|
1206
|
-
|
|
1207
|
-
@hookimpl
|
|
1208
|
-
@run_in_thread
|
|
1209
|
-
def on_task_instance_failed( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
|
|
1210
|
-
self, previous_state, task_instance: "TaskInstance", **kwargs
|
|
1211
|
-
) -> None:
|
|
1212
|
-
if self.check_kill_switch():
|
|
1213
|
-
return
|
|
1214
|
-
|
|
1215
|
-
self._set_log_level()
|
|
1216
|
-
|
|
1217
|
-
logger.debug(
|
|
1218
|
-
f"DataHub listener got notification about task instance failure for {task_instance.task_id}"
|
|
1219
|
-
)
|
|
1220
|
-
|
|
1221
|
-
# TODO: Handle UP_FOR_RETRY state.
|
|
1222
|
-
# TODO: Use the error parameter (available in kwargs for Airflow 3.0+) for better error reporting
|
|
1223
|
-
self.on_task_instance_finish(task_instance, status=InstanceRunResult.FAILURE)
|
|
1224
|
-
logger.debug(
|
|
1225
|
-
f"DataHub listener finished processing task instance failure for {task_instance.task_id}"
|
|
1226
|
-
)
|
|
1227
|
-
|
|
-    def on_dag_start(self, dag_run: "DagRun") -> None:  # type: ignore[no-untyped-def]
-        logger.debug(
-            f"DataHub on_dag_start called for dag_id={dag_run.dag_id}, run_id={dag_run.run_id}"
-        )
-        dag = dag_run.dag
-        if not dag:
-            logger.warning(
-                f"DataHub listener could not find DAG for {dag_run.dag_id} - {dag_run.run_id}. Dag won't be captured"
-            )
-            return
-
-        logger.debug(f"Generating DataFlow for DAG: {dag.dag_id}")
-        dataflow = AirflowGenerator.generate_dataflow(
-            config=self.config,
-            dag=dag,  # type: ignore[arg-type]
-        )
-        logger.debug(
-            f"Generated DataFlow URN: {dataflow.urn}, tags: {dataflow.tags}, description: {dataflow.description}"
-        )
-
-        # Ensure emitter is initialized
-        emitter = self._get_emitter()
-        if emitter is None:
-            logger.warning("DataHub emitter not available, skipping DataFlow emission")
-            return
-
-        # Emit dataflow
-        logger.debug(f"Emitting DataFlow MCPs for {dataflow.urn}")
-        dataflow.emit(emitter, callback=self._make_emit_callback())
-
-        event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
-            entityUrn=str(dataflow.urn), aspect=StatusClass(removed=False)
-        )
-        emitter.emit(event)
-
-        for task in dag.tasks:
-            task_urn = builder.make_data_job_urn_with_flow(
-                str(dataflow.urn), task.task_id
-            )
-            event = MetadataChangeProposalWrapper(
-                entityUrn=task_urn, aspect=StatusClass(removed=False)
-            )
-            emitter.emit(event)
-
-        if self.config.platform_instance:
-            instance = make_dataplatform_instance_urn(
-                platform="airflow",
-                instance=self.config.platform_instance,
-            )
-            event = MetadataChangeProposalWrapper(
-                entityUrn=str(dataflow.urn),
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn("airflow"),
-                    instance=instance,
-                ),
-            )
-            emitter.emit(event)
-
-        # emit tags
-        for tag in dataflow.tags:
-            tag_urn = builder.make_tag_urn(tag)
-
-            event = MetadataChangeProposalWrapper(
-                entityUrn=tag_urn, aspect=StatusClass(removed=False)
-            )
-            emitter.emit(event)
-
-        browsePaths: List[BrowsePathEntryClass] = []
-        if self.config.platform_instance:
-            urn = make_dataplatform_instance_urn(
-                "airflow", self.config.platform_instance
-            )
-            browsePaths.append(BrowsePathEntryClass(self.config.platform_instance, urn))
-        browsePaths.append(BrowsePathEntryClass(str(dag.dag_id)))
-        browse_path_v2_event: MetadataChangeProposalWrapper = (
-            MetadataChangeProposalWrapper(
-                entityUrn=str(dataflow.urn),
-                aspect=BrowsePathsV2Class(
-                    path=browsePaths,
-                ),
-            )
-        )
-        logger.debug(
-            f"Emitting BrowsePathsV2 MCP: entityUrn={browse_path_v2_event.entityUrn}, paths={[getattr(p, 'path', str(p)) for p in browsePaths]}"
-        )
-        emitter.emit(browse_path_v2_event)
-        logger.debug(f"Completed emitting all DataFlow MCPs for {dataflow.urn}")
-
-        if dag.dag_id == _DATAHUB_CLEANUP_DAG:
-            assert self.graph
-
-            logger.debug("Initiating the cleanup of obsolete data from datahub")
-
-            # get all ingested dataflow and datajob
-            ingested_dataflow_urns = list(
-                self.graph.get_urns_by_filter(
-                    platform="airflow",
-                    entity_types=["dataFlow"],
-                    platform_instance=self.config.platform_instance,
-                )
-            )
-            ingested_datajob_urns = list(
-                self.graph.get_urns_by_filter(
-                    platform="airflow",
-                    entity_types=["dataJob"],
-                    platform_instance=self.config.platform_instance,
-                )
-            )
-
-            # filter the ingested dataflow and datajob based on the cluster
-            filtered_ingested_dataflow_urns: List = []
-            filtered_ingested_datajob_urns: List = []
-
-            for ingested_dataflow_urn in ingested_dataflow_urns:
-                data_flow_aspect = self.graph.get_aspect(
-                    entity_urn=ingested_dataflow_urn, aspect_type=DataFlowKeyClass
-                )
-                if (
-                    data_flow_aspect is not None
-                    and data_flow_aspect.flowId != _DATAHUB_CLEANUP_DAG
-                    and data_flow_aspect is not None
-                    and data_flow_aspect.cluster == self.config.cluster
-                ):
-                    filtered_ingested_dataflow_urns.append(ingested_dataflow_urn)
-
-            for ingested_datajob_urn in ingested_datajob_urns:
-                data_job_aspect = self.graph.get_aspect(
-                    entity_urn=ingested_datajob_urn, aspect_type=DataJobKeyClass
-                )
-                if (
-                    data_job_aspect is not None
-                    and data_job_aspect.flow in filtered_ingested_dataflow_urns
-                ):
-                    filtered_ingested_datajob_urns.append(ingested_datajob_urn)
-
-            # get all airflow dags
-            all_airflow_dags = SerializedDagModel.read_all_dags().values()
-
-            airflow_flow_urns: List = []
-            airflow_job_urns: List = []
-
-            for dag in all_airflow_dags:
-                flow_urn = builder.make_data_flow_urn(
-                    orchestrator="airflow",
-                    flow_id=dag.dag_id,
-                    cluster=self.config.cluster,
-                    platform_instance=self.config.platform_instance,
-                )
-                airflow_flow_urns.append(flow_urn)
-
-                for task in dag.tasks:
-                    airflow_job_urns.append(
-                        builder.make_data_job_urn_with_flow(str(flow_urn), task.task_id)
-                    )
-
-            obsolete_pipelines = set(filtered_ingested_dataflow_urns) - set(
-                airflow_flow_urns
-            )
-            obsolete_tasks = set(filtered_ingested_datajob_urns) - set(airflow_job_urns)
-
-            obsolete_urns = obsolete_pipelines.union(obsolete_tasks)
-
-            asyncio.run(self._soft_delete_obsolete_urns(obsolete_urns=obsolete_urns))
-
-            logger.debug(f"total pipelines removed = {len(obsolete_pipelines)}")
-            logger.debug(f"total tasks removed = {len(obsolete_tasks)}")
-
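The cleanup branch above works purely on URN set differences: it rebuilds the URN for every DAG and task Airflow currently knows about, then treats any previously ingested flow or job URN without a counterpart as obsolete. A rough sketch of that idea, using the same builder helpers but invented DAG and task names:

```python
# Sketch only: the URN set-difference idea behind the cleanup DAG above.
# DAG/task names are made up for illustration.
import datahub.emitter.mce_builder as builder

flow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="example_dag", cluster="prod"
)
job_urn = builder.make_data_job_urn_with_flow(str(flow_urn), "example_task")
# flow_urn -> urn:li:dataFlow:(airflow,example_dag,prod)
# job_urn  -> urn:li:dataJob:(urn:li:dataFlow:(airflow,example_dag,prod),example_task)

# Anything ingested into DataHub that no longer appears among Airflow's
# current serialized DAGs becomes a candidate for soft deletion.
ingested_job_urns = {
    job_urn,
    builder.make_data_job_urn_with_flow(
        builder.make_data_flow_urn("airflow", "retired_dag", "prod"), "old_task"
    ),
}
current_job_urns = {job_urn}
obsolete_job_urns = ingested_job_urns - current_job_urns
print(obsolete_job_urns)
```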
-    if HAS_AIRFLOW_DAG_LISTENER_API:
-
-        @hookimpl
-        @run_in_thread
-        def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None:
-            logger.debug(
-                f"DataHub on_dag_run_running called for dag_id={dag_run.dag_id}, run_id={dag_run.run_id}, msg={msg}"
-            )
-            if self.check_kill_switch():
-                return
-
-            self._set_log_level()
-
-            logger.debug(
-                f"DataHub listener got notification about dag run start for {dag_run.dag_id}"
-            )
-
-            assert dag_run.dag_id
-            if not self.config.dag_filter_pattern.allowed(dag_run.dag_id):
-                logger.debug(f"DAG {dag_run.dag_id} is not allowed by the pattern")
-                return
-
-            self.on_dag_start(dag_run)
-            emitter = self._get_emitter()
-            if emitter:
-                emitter.flush()
-
-    # TODO: Add hooks for on_dag_run_success, on_dag_run_failed -> call AirflowGenerator.complete_dataflow
-
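The `dag_filter_pattern.allowed(...)` check above is a standard DataHub allow/deny regex pattern, so whole DAGs can be excluded from capture by configuration. A small illustration — the regexes are invented; the plugin builds the real pattern from its own config:

```python
# Illustration of the allow/deny filtering used by dag_filter_pattern above.
from datahub.configuration.common import AllowDenyPattern

dag_filter_pattern = AllowDenyPattern(allow=["^prod_.*"], deny=[".*_backfill$"])

print(dag_filter_pattern.allowed("prod_orders"))           # True  -> captured
print(dag_filter_pattern.allowed("prod_orders_backfill"))  # False -> skipped
print(dag_filter_pattern.allowed("dev_sandbox"))           # False -> skipped
```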
-    if HAS_AIRFLOW_DATASET_LISTENER_API:
-
-        @hookimpl
-        @run_in_thread
-        def on_dataset_created(self, dataset: "Dataset") -> None:  # type: ignore[no-untyped-def]
-            self._set_log_level()
-
-            logger.debug(
-                f"DataHub listener got notification about dataset create for {dataset}"
-            )
-
-        @hookimpl
-        @run_in_thread
-        def on_dataset_changed(self, dataset: "Dataset") -> None:  # type: ignore[no-untyped-def]
-            self._set_log_level()
-
-            logger.debug(
-                f"DataHub listener got notification about dataset change for {dataset}"
-            )
-
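The two dataset hooks above only log. For context, they fire when Airflow registers or updates a Dataset, typically via a task outlet. A minimal Airflow 2.x-style sketch (the dataset URI and DAG name are invented, and Airflow 3 renames this API to assets):

```python
# Sketch only (Airflow 2.x dataset API): a task outlet like this is what leads
# Airflow to invoke on_dataset_created / on_dataset_changed listeners.
from datetime import datetime

from airflow import DAG
from airflow.datasets import Dataset
from airflow.operators.bash import BashOperator

orders_dataset = Dataset("s3://example-bucket/orders/")  # invented URI

with DAG(dag_id="produce_orders", start_date=datetime(2024, 1, 1), schedule=None):
    BashOperator(
        task_id="write_orders",
        bash_command="echo 'write orders'",
        outlets=[orders_dataset],  # marks the dataset as updated on success
    )
```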
-    async def _soft_delete_obsolete_urns(self, obsolete_urns):
-        delete_tasks = [self._delete_obsolete_data(urn) for urn in obsolete_urns]
-        await asyncio.gather(*delete_tasks)
-
-    async def _delete_obsolete_data(self, obsolete_urn):
-        assert self.graph
-
-        if self.graph.exists(str(obsolete_urn)):
-            self.graph.soft_delete_entity(str(obsolete_urn))