acryl-datahub-airflow-plugin 1.3.1.3rc2__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
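The listing above splits the listener into parallel airflow2/ and airflow3/ packages (items 18 and 27), each with its own datahub_listener module. As a rough illustration of how such a split is typically selected at import time, here is a minimal sketch that gates on the installed Airflow version; the module paths are taken from the listing, but the 3.0.0 cutover and the dispatch logic itself are assumptions, not the package's actual code:

    import airflow.version
    import packaging.version

    AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)

    if AIRFLOW_VERSION >= packaging.version.parse("3.0.0"):
        # Airflow 3.x installs would load the listener under airflow3/.
        from datahub_airflow_plugin.airflow3 import datahub_listener  # noqa: F401
    else:
        # Airflow 2.x installs use the airflow2/ listener shown in the hunk below.
        from datahub_airflow_plugin.airflow2 import datahub_listener  # noqa: F401

The hunk below shows the Airflow 2.x branch of that split.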
datahub_airflow_plugin/airflow2/datahub_listener.py
@@ -0,0 +1,1072 @@
1
+ import asyncio
2
+ import copy
3
+ import functools
4
+ import logging
5
+ import os
6
+ import threading
7
+ import time
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Any,
11
+ Callable,
12
+ Dict,
13
+ List,
14
+ Optional,
15
+ Tuple,
16
+ TypeVar,
17
+ cast,
18
+ )
19
+
20
+ import airflow
21
+
22
+ # Import Airflow 2.x specific shims (clean, no cross-version complexity)
23
+ import airflow.version
24
+ import packaging.version
25
+ from airflow.models import Variable
26
+ from airflow.models.serialized_dag import SerializedDagModel
27
+
28
+ # Import Airflow 2.x compatibility and patches before the remaining imports
29
+ # Wrap in try-except to ensure listener can still load if compatibility module has issues
30
+ try:
31
+ from datahub_airflow_plugin.airflow2 import _airflow_compat # noqa: F401
32
+ except Exception as e:
33
+ # Log but don't fail - compatibility patches are optional
34
+ import logging
35
+
36
+ logger = logging.getLogger(__name__)
37
+ logger.warning(
38
+ f"Could not import Airflow 2.x compatibility module: {e}. Some patches may not be applied."
39
+ )
40
+
41
+ # Conditional import for OpenLineage (may not be installed)
42
+ try:
43
+ from openlineage.client.serde import Serde
44
+
45
+ OPENLINEAGE_AVAILABLE = True
46
+ except ImportError:
47
+ # Not available when openlineage packages aren't installed
48
+ Serde = None # type: ignore[assignment,misc]
49
+ OPENLINEAGE_AVAILABLE = False
50
+
51
+ import datahub.emitter.mce_builder as builder
52
+ from datahub.api.entities.datajob import DataJob
53
+ from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult
54
+ from datahub.emitter.mce_builder import (
55
+ make_data_platform_urn,
56
+ make_dataplatform_instance_urn,
57
+ )
58
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
59
+ from datahub.emitter.rest_emitter import DatahubRestEmitter
60
+ from datahub.ingestion.graph.client import DataHubGraph
61
+ from datahub.metadata.schema_classes import (
62
+ BrowsePathEntryClass,
63
+ BrowsePathsV2Class,
64
+ DataFlowKeyClass,
65
+ DataJobInputOutputClass,
66
+ DataJobKeyClass,
67
+ DataPlatformInstanceClass,
68
+ FineGrainedLineageClass,
69
+ FineGrainedLineageDownstreamTypeClass,
70
+ FineGrainedLineageUpstreamTypeClass,
71
+ OperationClass,
72
+ OperationTypeClass,
73
+ StatusClass,
74
+ )
75
+ from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
76
+ from datahub.telemetry import telemetry
77
+ from datahub_airflow_plugin._config import DatahubLineageConfig, get_lineage_config
78
+ from datahub_airflow_plugin._constants import (
79
+ DATAHUB_SQL_PARSING_RESULT_KEY,
80
+ SQL_PARSING_RESULT_KEY,
81
+ )
82
+ from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn
83
+ from datahub_airflow_plugin._version import __package_name__, __version__
84
+ from datahub_airflow_plugin.airflow2._extractors import ExtractorManager
85
+ from datahub_airflow_plugin.airflow2._shims import (
86
+ OpenLineagePlugin,
87
+ Operator,
88
+ TaskHolder,
89
+ get_task_inlets,
90
+ get_task_outlets,
91
+ redact_with_exclusions,
92
+ )
93
+ from datahub_airflow_plugin.client.airflow_generator import ( # type: ignore[attr-defined]
94
+ AirflowGenerator,
95
+ )
96
+ from datahub_airflow_plugin.entities import (
97
+ _Entity,
98
+ entities_to_datajob_urn_list,
99
+ entities_to_dataset_urn_list,
100
+ )
101
+
102
+ # Feature flags for Airflow 2.x
103
+ AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)
104
+ HAS_AIRFLOW_DAG_LISTENER_API: bool = AIRFLOW_VERSION >= packaging.version.parse(
105
+ "2.5.0.dev0"
106
+ )
107
+ HAS_AIRFLOW_DATASET_LISTENER_API: bool = AIRFLOW_VERSION >= packaging.version.parse(
108
+ "2.8.0.dev0"
109
+ )
110
+
111
+ _F = TypeVar("_F", bound=Callable[..., None])
112
+ if TYPE_CHECKING:
113
+ from airflow.datasets import Dataset
114
+ from airflow.models import DAG, DagRun, TaskInstance
115
+
116
+ # To placate mypy on Airflow versions that don't have the listener API,
117
+ # we define a dummy hookimpl that's an identity function.
118
+
119
+ def hookimpl(f: _F) -> _F: # type: ignore[misc]
120
+ return f
121
+
122
+ else:
123
+ from airflow.listeners import hookimpl
124
+
125
+ logger = logging.getLogger(__name__)
126
+
127
+
128
+ def _get_dagrun_from_task_instance(task_instance: "TaskInstance") -> "DagRun":
129
+ """
130
+ Get a DagRun from a TaskInstance (Airflow 2.x).
131
+
132
+ In Airflow 2.x, TaskInstance has a dag_run attribute.
133
+ """
134
+ return task_instance.dag_run # type: ignore[return-value]
135
+
136
+
137
+ _airflow_listener_initialized = False
138
+ _airflow_listener: Optional["DataHubListener"] = None
139
+ _airflow_listener_lock = threading.Lock()
140
+
141
+ # Threading is enabled by default for better performance
142
+ # It prevents slow lineage extraction from blocking task completion
143
+ # Can be disabled by setting DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD=false
144
+ _RUN_IN_THREAD = os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD", "true").lower() in (
145
+ "true",
146
+ "1",
147
+ )
148
+ _RUN_IN_THREAD_TIMEOUT = float(
149
+ os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD_TIMEOUT", 10)
150
+ )
151
+ _DATAHUB_CLEANUP_DAG = "Datahub_Cleanup"
152
+
153
+ KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"
154
+
155
+
156
+ def get_airflow_plugin_listener() -> Optional["DataHubListener"]:
157
+ """
158
+ Get or initialize the DataHub listener singleton.
159
+
160
+ Uses double-checked locking pattern for thread-safe lazy initialization.
161
+ This prevents race conditions when multiple worker threads try to initialize
162
+ the listener simultaneously.
163
+ """
164
+ global _airflow_listener_initialized
165
+ global _airflow_listener
166
+
167
+ # Fast path: if already initialized, return immediately without acquiring lock
168
+ if _airflow_listener_initialized:
169
+ return _airflow_listener
170
+
171
+ # Slow path: acquire lock for initialization
172
+ with _airflow_listener_lock:
173
+ # Double-check: another thread might have initialized while we waited for lock
174
+ if _airflow_listener_initialized:
175
+ return _airflow_listener
176
+
177
+ # Now safe to initialize - we hold the lock and confirmed not initialized
178
+ _airflow_listener_initialized = True
179
+
180
+ plugin_config = get_lineage_config()
181
+
182
+ if plugin_config.enabled:
183
+ _airflow_listener = DataHubListener(config=plugin_config)
184
+ logger.info(
185
+ f"DataHub plugin v2 (package: {__package_name__} and version: {__version__}) listener initialized with config: {plugin_config}"
186
+ )
187
+
188
+ telemetry.telemetry_instance.ping(
189
+ "airflow-plugin-init",
190
+ {
191
+ "airflow-version": airflow.__version__,
192
+ "datahub-airflow-plugin": "v2",
193
+ "datahub-airflow-plugin-dag-events": HAS_AIRFLOW_DAG_LISTENER_API,
194
+ "capture_executions": plugin_config.capture_executions,
195
+ "capture_tags": plugin_config.capture_tags_info,
196
+ "capture_ownership": plugin_config.capture_ownership_info,
197
+ "enable_extractors": plugin_config.enable_extractors,
198
+ "render_templates": plugin_config.render_templates,
199
+ "disable_openlineage_plugin": plugin_config.disable_openlineage_plugin,
200
+ },
201
+ )
202
+
203
+ # Debug: Log OpenLineage plugin state
204
+ if OpenLineagePlugin is not None:
205
+ logger.info(
206
+ f"OpenLineage plugin state: listeners={len(getattr(OpenLineagePlugin, 'listeners', []))} items, "
207
+ f"disable_openlineage_plugin={plugin_config.disable_openlineage_plugin}"
208
+ )
209
+
210
+ if plugin_config.disable_openlineage_plugin and OpenLineagePlugin is not None:
211
+ # Deactivate the OpenLineagePlugin listener to avoid conflicts/errors.
212
+ OpenLineagePlugin.listeners = []
213
+ logger.info("Cleared OpenLineage plugin listeners")
214
+
215
+ return _airflow_listener
216
+
217
+
218
+ def run_in_thread(f: _F) -> _F:
219
+ # This is also responsible for catching exceptions and logging them.
220
+
221
+ @functools.wraps(f)
222
+ def wrapper(*args, **kwargs):
223
+ def safe_target():
224
+ """
225
+ Wrapper for the thread target that catches and logs exceptions.
226
+
227
+ Without this, exceptions raised inside the thread would be silently
228
+ lost, making debugging production issues nearly impossible.
229
+ """
230
+ try:
231
+ f(*args, **kwargs)
232
+ except Exception as e:
233
+ logger.error(
234
+ f"Error in thread executing {f.__name__}: {e}",
235
+ exc_info=True,
236
+ )
237
+
238
+ try:
239
+ if _RUN_IN_THREAD:
240
+ # A poor-man's timeout mechanism.
241
+ # This ensures that we don't hang the task if the extractors
242
+ # are slow or the DataHub API is slow to respond.
243
+
244
+ thread = threading.Thread(target=safe_target, daemon=True)
245
+ thread.start()
246
+
247
+ if _RUN_IN_THREAD_TIMEOUT > 0:
248
+ # If _RUN_IN_THREAD_TIMEOUT is 0, we just kick off the thread and move on.
249
+ # Because it's a daemon thread, it'll be automatically killed when the main
250
+ # thread exits.
251
+
252
+ start_time = time.time()
253
+ thread.join(timeout=_RUN_IN_THREAD_TIMEOUT)
254
+ if thread.is_alive():
255
+ logger.warning(
256
+ f"Thread for {f.__name__} is still running after {_RUN_IN_THREAD_TIMEOUT} seconds. "
257
+ "Continuing without waiting for it to finish."
258
+ )
259
+ else:
260
+ logger.debug(
261
+ f"Thread for {f.__name__} finished after {time.time() - start_time} seconds"
262
+ )
263
+ else:
264
+ f(*args, **kwargs)
265
+ except Exception as e:
266
+ logger.warning(
267
+ f"Error setting up thread for {f.__name__}: {e}",
268
+ exc_info=True,
269
+ )
270
+
271
+ return cast(_F, wrapper)
272
+
273
+
274
+ def _render_templates(task_instance: "TaskInstance") -> "TaskInstance":
275
+ # Render templates in a copy of the task instance.
276
+ # This is necessary to get the correct operator args in the extractors.
277
+
278
+ try:
279
+ task_instance_copy = copy.deepcopy(task_instance)
280
+ task_instance_copy.render_templates()
281
+ return task_instance_copy
282
+ except Exception as e:
283
+ logger.info(
284
+ f"Error rendering templates in DataHub listener. Jinja-templated variables will not be extracted correctly: {e}. Template rendering improves SQL parsing accuracy. If this causes issues, you can disable it by setting `render_templates` to `false` in the DataHub plugin configuration."
285
+ )
286
+ return task_instance
287
+
288
+
289
+ class DataHubListener:
290
+ __name__ = "DataHubListener"
291
+
292
+ def __init__(self, config: DatahubLineageConfig):
293
+ self.config = config
294
+ self._set_log_level()
295
+
296
+ self._emitter = config.make_emitter_hook().make_emitter()
297
+ self._graph: Optional[DataHubGraph] = None
298
+ logger.info(f"DataHub plugin v2 using {repr(self._emitter)}")
299
+
300
+ # See discussion here https://github.com/OpenLineage/OpenLineage/pull/508 for
301
+ # why we need to keep track of tasks ourselves.
302
+ # Note: TaskHolder is only available in legacy openlineage-airflow package,
303
+ # not in apache-airflow-providers-openlineage (where task_instance.task is directly available)
304
+ self._task_holder: Any = TaskHolder() if TaskHolder is not None else None
305
+
306
+ # In our case, we also want to cache the initial datajob object
307
+ # so that we can add to it when the task completes.
308
+ self._datajob_holder: Dict[str, DataJob] = {}
309
+
310
+ # Create extractor_manager for Airflow 2.x with patch/extractor configuration
311
+ self.extractor_manager = ExtractorManager(
312
+ patch_sql_parser=self.config.patch_sql_parser,
313
+ patch_snowflake_schema=self.config.patch_snowflake_schema,
314
+ extract_athena_operator=self.config.extract_athena_operator,
315
+ extract_bigquery_insert_job_operator=self.config.extract_bigquery_insert_job_operator,
316
+ extract_teradata_operator=self.config.extract_teradata_operator,
317
+ )
318
+
319
+ # This "inherits" from types.ModuleType to avoid issues with Airflow's listener plugin loader.
320
+ # It previously (v2.4.x and likely other versions too) would throw errors if it was not a module.
321
+ # https://github.com/apache/airflow/blob/e99a518970b2d349a75b1647f6b738c8510fa40e/airflow/listeners/listener.py#L56
322
+ # self.__class__ = types.ModuleType
323
+
324
+ @property
325
+ def emitter(self):
326
+ return self._emitter
327
+
328
+ @property
329
+ def graph(self) -> Optional[DataHubGraph]:
330
+ if self._graph:
331
+ return self._graph
332
+
333
+ if isinstance(self._emitter, DatahubRestEmitter) and not isinstance(
334
+ self._emitter, DataHubGraph
335
+ ):
336
+ # This is lazy initialized to avoid throwing errors on plugin load.
337
+ self._graph = self._emitter.to_graph()
338
+ self._emitter = self._graph
339
+
340
+ return self._graph
341
+
342
+ def _set_log_level(self) -> None:
343
+ """Set the log level for the plugin and its dependencies.
344
+
345
+ This may need to be called multiple times, since Airflow sometimes
346
+ messes with the logging configuration after the plugin is loaded.
347
+ In particular, the loggers may get changed when the worker starts
348
+ executing a task.
349
+ """
350
+
351
+ if self.config.log_level:
352
+ logging.getLogger(__name__.split(".")[0]).setLevel(self.config.log_level)
353
+ if self.config.debug_emitter:
354
+ logging.getLogger("datahub.emitter").setLevel(logging.DEBUG)
355
+
356
+ def _make_emit_callback(self) -> Callable[[Optional[Exception], str], None]:
357
+ def emit_callback(err: Optional[Exception], msg: str) -> None:
358
+ if err:
359
+ logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err)
360
+
361
+ return emit_callback
362
+
363
+ def _extract_lineage_from_airflow2(
364
+ self,
365
+ datajob: DataJob,
366
+ dagrun: "DagRun",
367
+ task: "Operator",
368
+ task_instance: "TaskInstance",
369
+ complete: bool,
370
+ ) -> Tuple[List[str], List[str], Optional[SqlParsingResult], Optional[Any]]:
371
+ """Extract lineage using Airflow 2.x extractor system."""
372
+ input_urns: List[str] = []
373
+ output_urns: List[str] = []
374
+
375
+ task_metadata = self.extractor_manager.extract_metadata( # type: ignore[union-attr]
376
+ dagrun,
377
+ task,
378
+ complete=complete,
379
+ task_instance=task_instance,
380
+ task_uuid=str(datajob.urn),
381
+ graph=self.graph,
382
+ )
383
+ logger.debug(f"Got task metadata: {task_metadata}")
384
+
385
+ # Translate task_metadata.inputs/outputs to DataHub URNs.
386
+ input_urns.extend(
387
+ translate_ol_to_datahub_urn(dataset) for dataset in task_metadata.inputs
388
+ )
389
+ output_urns.extend(
390
+ translate_ol_to_datahub_urn(dataset) for dataset in task_metadata.outputs
391
+ )
392
+
393
+ # Extract and remove DataHub's custom SQL parsing result from run_facets.
394
+ # We use .pop() (not .get()) to remove the key so that when task_metadata.run_facets
395
+ # are serialized as OpenLineage facets later, they don't include DataHub-specific
396
+ # additions. This keeps the OpenLineage facets clean and standards-compliant.
397
+ sql_parsing_result = task_metadata.run_facets.pop(SQL_PARSING_RESULT_KEY, None)
398
+ # Also check for DATAHUB_SQL_PARSING_RESULT_KEY (used by provider mode patches)
399
+ if DATAHUB_SQL_PARSING_RESULT_KEY in task_metadata.run_facets:
400
+ if sql_parsing_result is None:
401
+ sql_parsing_result = task_metadata.run_facets.pop(
402
+ DATAHUB_SQL_PARSING_RESULT_KEY, None
403
+ )
404
+ else:
405
+ # If both keys exist, prefer DATAHUB_SQL_PARSING_RESULT_KEY and remove the other
406
+ task_metadata.run_facets.pop(DATAHUB_SQL_PARSING_RESULT_KEY, None)
407
+
408
+ return input_urns, output_urns, sql_parsing_result, task_metadata
409
+
410
+ def _process_sql_parsing_result(
411
+ self,
412
+ datajob: DataJob,
413
+ sql_parsing_result: Optional[SqlParsingResult],
414
+ ) -> Tuple[List[str], List[str], List[FineGrainedLineageClass]]:
415
+ """Process SQL parsing result and return additional URNs and column lineage."""
416
+ input_urns: List[str] = []
417
+ output_urns: List[str] = []
418
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
419
+
420
+ if not sql_parsing_result:
421
+ return input_urns, output_urns, fine_grained_lineages
422
+
423
+ if error := sql_parsing_result.debug_info.error:
424
+ logger.info(f"SQL parsing error: {error}", exc_info=error)
425
+ datajob.properties["datahub_sql_parser_error"] = (
426
+ f"{type(error).__name__}: {error}"
427
+ )
428
+
429
+ if not sql_parsing_result.debug_info.table_error:
430
+ input_urns.extend(sql_parsing_result.in_tables)
431
+ output_urns.extend(sql_parsing_result.out_tables)
432
+
433
+ if sql_parsing_result.column_lineage:
434
+ fine_grained_lineages.extend(
435
+ FineGrainedLineageClass(
436
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
437
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
438
+ upstreams=[
439
+ builder.make_schema_field_urn(
440
+ upstream.table, upstream.column
441
+ )
442
+ for upstream in column_lineage.upstreams
443
+ ],
444
+ downstreams=[
445
+ builder.make_schema_field_urn(
446
+ downstream.table, downstream.column
447
+ )
448
+ for downstream in [column_lineage.downstream]
449
+ if downstream.table
450
+ ],
451
+ )
452
+ for column_lineage in sql_parsing_result.column_lineage
453
+ )
454
+
455
+ return input_urns, output_urns, fine_grained_lineages
456
+
457
+ def _extract_lineage(
458
+ self,
459
+ datajob: DataJob,
460
+ dagrun: "DagRun",
461
+ task: "Operator",
462
+ task_instance: "TaskInstance",
463
+ complete: bool = False,
464
+ ) -> None:
465
+ """
466
+ Combine lineage (including column lineage) from task inlets/outlets and
467
+ extractor-generated task_metadata and write it to the datajob. This
468
+ routine is also responsible for converting the lineage to DataHub URNs.
469
+ """
470
+ if not self.config.enable_datajob_lineage:
471
+ return
472
+
473
+ input_urns: List[str] = []
474
+ output_urns: List[str] = []
475
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
476
+
477
+ task_metadata = None
478
+ sql_parsing_result: Optional[SqlParsingResult] = None
479
+
480
+ # Extract lineage using Airflow 2.x extractors
481
+ if self.config.enable_extractors:
482
+ (
483
+ extracted_input_urns,
484
+ extracted_output_urns,
485
+ sql_parsing_result,
486
+ task_metadata,
487
+ ) = self._extract_lineage_from_airflow2(
488
+ datajob, dagrun, task, task_instance, complete
489
+ )
490
+ input_urns.extend(extracted_input_urns)
491
+ output_urns.extend(extracted_output_urns)
492
+
493
+ # Process SQL parsing result
494
+ sql_input_urns, sql_output_urns, sql_fine_grained_lineages = (
495
+ self._process_sql_parsing_result(datajob, sql_parsing_result)
496
+ )
497
+ input_urns.extend(sql_input_urns)
498
+ output_urns.extend(sql_output_urns)
499
+ fine_grained_lineages.extend(sql_fine_grained_lineages)
500
+
501
+ # Add DataHub-native inlets/outlets
502
+ input_urns.extend(
503
+ iolet.urn for iolet in get_task_inlets(task) if isinstance(iolet, _Entity)
504
+ )
505
+ output_urns.extend(
506
+ iolet.urn for iolet in get_task_outlets(task) if isinstance(iolet, _Entity)
507
+ )
508
+
509
+ # Write the lineage to the datajob object
510
+ datajob.inlets.extend(entities_to_dataset_urn_list(input_urns))
511
+ datajob.outlets.extend(entities_to_dataset_urn_list(output_urns))
512
+ datajob.upstream_urns.extend(entities_to_datajob_urn_list(input_urns))
513
+ datajob.fine_grained_lineages.extend(fine_grained_lineages)
514
+
515
+ # Merge with datajob from task start (if this is task completion)
516
+ if complete:
517
+ original_datajob = self._datajob_holder.get(str(datajob.urn), None)
518
+ else:
519
+ self._datajob_holder[str(datajob.urn)] = datajob
520
+ original_datajob = None
521
+
522
+ if original_datajob:
523
+ logger.debug("Merging start datajob into finish datajob")
524
+ datajob.inlets.extend(original_datajob.inlets)
525
+ datajob.outlets.extend(original_datajob.outlets)
526
+ datajob.upstream_urns.extend(original_datajob.upstream_urns)
527
+ datajob.fine_grained_lineages.extend(original_datajob.fine_grained_lineages)
528
+
529
+ for k, v in original_datajob.properties.items():
530
+ datajob.properties.setdefault(k, v)
531
+
532
+ # Deduplicate inlets/outlets
533
+ datajob.inlets = list(sorted(set(datajob.inlets), key=lambda x: str(x)))
534
+ datajob.outlets = list(sorted(set(datajob.outlets), key=lambda x: str(x)))
535
+ datajob.upstream_urns = list(
536
+ sorted(set(datajob.upstream_urns), key=lambda x: str(x))
537
+ )
538
+
539
+ # Write all other OL facets as DataHub properties
540
+ if task_metadata and Serde is not None:
541
+ for k, v in task_metadata.job_facets.items():
542
+ # Redaction is only available with legacy openlineage-airflow package
543
+ value_to_serialize = (
544
+ redact_with_exclusions(v)
545
+ if redact_with_exclusions is not None
546
+ else v
547
+ ) # type: ignore[arg-type]
548
+ datajob.properties[f"openlineage_job_facet_{k}"] = Serde.to_json(
549
+ value_to_serialize
550
+ )
551
+
552
+ for k, v in task_metadata.run_facets.items():
553
+ # Skip DataHub-specific keys that can't be serialized by OpenLineage's Serde
554
+ # These are SqlParsingResult objects, not attrs-decorated classes
555
+ if k in (SQL_PARSING_RESULT_KEY, DATAHUB_SQL_PARSING_RESULT_KEY):
556
+ logger.debug(
557
+ f"Skipping serialization of DataHub-specific run_facet key: {k}"
558
+ )
559
+ continue
560
+ # Redaction is only available with legacy openlineage-airflow package
561
+ value_to_serialize = (
562
+ redact_with_exclusions(v)
563
+ if redact_with_exclusions is not None
564
+ else v
565
+ ) # type: ignore[arg-type]
566
+ datajob.properties[f"openlineage_run_facet_{k}"] = Serde.to_json(
567
+ value_to_serialize
568
+ )
569
+
570
+ def check_kill_switch(self) -> bool:
571
+ # For Airflow 2.x, use Variable.get()
572
+ try:
573
+ if Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true":
574
+ logger.debug("DataHub listener disabled by kill switch")
575
+ return True
576
+ except Exception as e:
577
+ logger.debug(f"Error checking kill switch variable: {e}")
578
+ return False
579
+
580
+ def _prepare_task_context(
581
+ self, task_instance: "TaskInstance", for_completion: bool = False
582
+ ) -> Optional[Tuple["DagRun", "Operator", "DAG"]]:
583
+ """
584
+ Prepare task context by extracting DAG run, task, and DAG from task instance.
585
+
586
+ Args:
587
+ task_instance: The Airflow task instance
588
+ for_completion: If True, retrieves task from holder for completion events
589
+
590
+ Returns:
591
+ Tuple of (dagrun, task, dag) or None if context cannot be prepared
592
+ """
593
+ # Get dagrun in a version-compatible way (Airflow 2.x vs 3.x)
594
+ dagrun: "DagRun" = _get_dagrun_from_task_instance(task_instance)
595
+
596
+ if self.config.render_templates:
597
+ task_instance = _render_templates(task_instance)
598
+
599
+ # Get task - different logic for start vs completion events
600
+ if for_completion:
601
+ # For completion: prefer task attribute, fallback to holder
602
+ if getattr(task_instance, "task", None):
603
+ task = task_instance.task
604
+ elif hasattr(self._task_holder, "get_task"):
605
+ task = self._task_holder.get_task(task_instance)
606
+ else:
607
+ task = None
608
+ else:
609
+ # For start: task should be directly available
610
+ task = task_instance.task
611
+
612
+ if task is None:
613
+ return None
614
+
615
+ dag: "DAG" = task.dag # type: ignore[assignment]
616
+
617
+ # Check if DAG is allowed by filter pattern
618
+ if not self.config.dag_filter_pattern.allowed(dag.dag_id):
619
+ logger.debug(f"DAG {dag.dag_id} is not allowed by the pattern")
620
+ return None
621
+
622
+ # Task type can vary between Airflow versions (MappedOperator, SerializedBaseOperator, etc.)
623
+ return dagrun, task, dag # type: ignore[return-value]
624
+
625
+ def _generate_and_emit_datajob(
626
+ self,
627
+ dagrun: "DagRun",
628
+ task: "Operator",
629
+ dag: "DAG",
630
+ task_instance: "TaskInstance",
631
+ complete: bool = False,
632
+ ) -> DataJob:
633
+ """
634
+ Generate DataJob with lineage and emit it to DataHub.
635
+
636
+ Args:
637
+ dagrun: The DAG run
638
+ task: The task operator
639
+ dag: The DAG
640
+ task_instance: The task instance
641
+ complete: Whether this is for task completion
642
+
643
+ Returns:
644
+ The generated DataJob
645
+ """
646
+ datajob = AirflowGenerator.generate_datajob(
647
+ cluster=self.config.cluster,
648
+ task=task, # type: ignore[arg-type]
649
+ dag=dag,
650
+ capture_tags=self.config.capture_tags_info,
651
+ capture_owner=self.config.capture_ownership_info,
652
+ config=self.config,
653
+ )
654
+
655
+ # Add lineage info
656
+ self._extract_lineage(datajob, dagrun, task, task_instance, complete=complete) # type: ignore[arg-type]
657
+
658
+ # Emit DataJob MCPs
659
+ # Skip dataJobInputOutput aspects on task start to avoid file emitter merging duplicates
660
+ # The file emitter merges aspects with the same entity URN and aspect name,
661
+ # which causes FGLs from start and completion to be combined into duplicates.
662
+ # We only emit the aspect on completion when lineage is complete and accurate.
663
+ for mcp in datajob.generate_mcp(
664
+ generate_lineage=self.config.enable_datajob_lineage,
665
+ materialize_iolets=self.config.materialize_iolets,
666
+ ):
667
+ # Skip dataJobInputOutput aspects on task start
668
+ if not complete:
669
+ if isinstance(mcp.aspect, DataJobInputOutputClass):
670
+ logger.debug(
671
+ f"Skipping dataJobInputOutput for task {task.task_id} on start "
672
+ f"(will be emitted on completion to avoid file emitter merging duplicates)"
673
+ )
674
+ continue
675
+
676
+ self.emitter.emit(mcp, self._make_emit_callback())
677
+
678
+ status_text = f"finish w/ status {complete}" if complete else "start"
679
+ logger.debug(f"Emitted DataHub Datajob {status_text}: {datajob}")
680
+
681
+ return datajob
682
+
683
+ @hookimpl
684
+ @run_in_thread
685
+ def on_task_instance_running( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
686
+ self, previous_state, task_instance: "TaskInstance", **kwargs
687
+ ) -> None:
688
+ # In Airflow 3.0, the session parameter was removed from the hook signature
689
+ if self.check_kill_switch():
690
+ return
691
+ self._set_log_level()
692
+
693
+ # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508.
694
+ if not hasattr(task_instance, "task"):
695
+ logger.warning(
696
+ f"No task set for task_id: {task_instance.task_id} - " # type: ignore[attr-defined]
697
+ f"dag_id: {task_instance.dag_id} - run_id {task_instance.run_id}" # type: ignore[attr-defined]
698
+ )
699
+ return
700
+
701
+ logger.debug(
702
+ f"DataHub listener got notification about task instance start for {task_instance.task_id} of dag {task_instance.dag_id}"
703
+ )
704
+
705
+ # Check if DAG is allowed before doing any expensive operations
706
+ if not self.config.dag_filter_pattern.allowed(task_instance.dag_id):
707
+ logger.debug(f"DAG {task_instance.dag_id} is not allowed by the pattern")
708
+ return
709
+
710
+ # Handle async operators in Airflow 2.3 by skipping deferred state.
711
+ # Inspired by https://github.com/OpenLineage/OpenLineage/pull/1601
712
+ if (
713
+ hasattr(task_instance, "next_method")
714
+ and task_instance.next_method is not None
715
+ ):
716
+ return
717
+
718
+ # Render templates and extract context
719
+ if self.config.render_templates:
720
+ task_instance = _render_templates(task_instance)
721
+
722
+ dagrun: "DagRun" = _get_dagrun_from_task_instance(task_instance)
723
+ task = task_instance.task
724
+ assert task is not None
725
+ dag: "DAG" = task.dag # type: ignore[assignment]
726
+
727
+ # Store task for later retrieval (only available with legacy openlineage-airflow package)
728
+ if self._task_holder is not None:
729
+ self._task_holder.set_task(task_instance)
730
+
731
+ # If we don't have the DAG listener API, emit DAG start event
732
+ if not HAS_AIRFLOW_DAG_LISTENER_API:
733
+ self.on_dag_start(dagrun)
734
+
735
+ # Generate and emit datajob
736
+ # Task type can vary between Airflow versions (MappedOperator from different modules)
737
+ datajob = self._generate_and_emit_datajob(
738
+ dagrun,
739
+ task, # type: ignore[arg-type]
740
+ dag,
741
+ task_instance,
742
+ complete=False,
743
+ )
744
+
745
+ # Emit process instance if capturing executions
746
+ if self.config.capture_executions:
747
+ dpi = AirflowGenerator.run_datajob(
748
+ emitter=self.emitter,
749
+ config=self.config,
750
+ ti=task_instance,
751
+ dag=dag,
752
+ dag_run=dagrun,
753
+ datajob=datajob,
754
+ emit_templates=False,
755
+ )
756
+ logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}")
757
+
758
+ self.emitter.flush()
759
+
760
+ logger.debug(
761
+ f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}"
762
+ )
763
+
764
+ self.materialize_iolets(datajob)
765
+
766
+ def materialize_iolets(self, datajob: DataJob) -> None:
767
+ if self.config.materialize_iolets:
768
+ for outlet in datajob.outlets:
769
+ reported_time: int = int(time.time() * 1000)
770
+ operation = OperationClass(
771
+ timestampMillis=reported_time,
772
+ operationType=OperationTypeClass.CREATE,
773
+ lastUpdatedTimestamp=reported_time,
774
+ actor=builder.make_user_urn("airflow"),
775
+ )
776
+
777
+ operation_mcp = MetadataChangeProposalWrapper(
778
+ entityUrn=str(outlet), aspect=operation
779
+ )
780
+
781
+ self.emitter.emit(operation_mcp)
782
+ logger.debug(f"Emitted Dataset Operation: {outlet}")
783
+ else:
784
+ if self.graph:
785
+ for outlet in datajob.outlets:
786
+ if not self.graph.exists(str(outlet)):
787
+ logger.warning(f"Dataset {str(outlet)} not materialized")
788
+ for inlet in datajob.inlets:
789
+ if not self.graph.exists(str(inlet)):
790
+ logger.warning(f"Dataset {str(inlet)} not materialized")
791
+
792
+ def on_task_instance_finish(
793
+ self, task_instance: "TaskInstance", status: InstanceRunResult
794
+ ) -> None:
795
+ # Prepare task context (handles template rendering, task retrieval, DAG filtering)
796
+ context = self._prepare_task_context(task_instance, for_completion=True)
797
+ if context is None:
798
+ return
799
+
800
+ dagrun, task, dag = context
801
+
802
+ # Generate and emit datajob with lineage
803
+ datajob = self._generate_and_emit_datajob(
804
+ dagrun, task, dag, task_instance, complete=True
805
+ )
806
+
807
+ # Emit process instance if capturing executions
808
+ if self.config.capture_executions:
809
+ dpi = AirflowGenerator.complete_datajob(
810
+ emitter=self.emitter,
811
+ cluster=self.config.cluster,
812
+ ti=task_instance,
813
+ dag=dag,
814
+ dag_run=dagrun,
815
+ datajob=datajob,
816
+ result=status,
817
+ config=self.config,
818
+ )
819
+ logger.debug(
820
+ f"Emitted DataHub DataProcess Instance with status {status}: {dpi}"
821
+ )
822
+ # Emit inlet/outlet aspects for DataProcessInstance (emit_process_end only emits run event)
823
+ # This matches the behavior of emit_process_start which calls generate_mcp()
824
+ for mcp in dpi.generate_inlet_outlet_mcp(materialize_iolets=False):
825
+ self.emitter.emit(mcp, self._make_emit_callback())
826
+
827
+ # Materialize iolets on completion (outlets may be populated during execution)
828
+ # This ensures operation aspects are emitted for datasets created during task execution
829
+ self.materialize_iolets(datajob)
830
+
831
+ self.emitter.flush()
832
+
833
+ @hookimpl
834
+ @run_in_thread
835
+ def on_task_instance_success( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
836
+ self, previous_state, task_instance: "TaskInstance", **kwargs
837
+ ) -> None:
838
+ if self.check_kill_switch():
839
+ return
840
+
841
+ self._set_log_level()
842
+
843
+ logger.debug(
844
+ f"DataHub listener got notification about task instance success for {task_instance.task_id}"
845
+ )
846
+ self.on_task_instance_finish(task_instance, status=InstanceRunResult.SUCCESS)
847
+ logger.debug(
848
+ f"DataHub listener finished processing task instance success for {task_instance.task_id}"
849
+ )
850
+
851
+ @hookimpl
852
+ @run_in_thread
853
+ def on_task_instance_failed( # type: ignore[no-untyped-def] # Airflow 3.0 removed previous_state parameter
854
+ self, previous_state, task_instance: "TaskInstance", **kwargs
855
+ ) -> None:
856
+ if self.check_kill_switch():
857
+ return
858
+
859
+ self._set_log_level()
860
+
861
+ logger.debug(
862
+ f"DataHub listener got notification about task instance failure for {task_instance.task_id}"
863
+ )
864
+
865
+ # TODO: Handle UP_FOR_RETRY state.
866
+ # TODO: Use the error parameter (available in kwargs for Airflow 3.0+) for better error reporting
867
+ self.on_task_instance_finish(task_instance, status=InstanceRunResult.FAILURE)
868
+ logger.debug(
869
+ f"DataHub listener finished processing task instance failure for {task_instance.task_id}"
870
+ )
871
+
872
+ def on_dag_start(self, dag_run: "DagRun") -> None: # type: ignore[no-untyped-def]
873
+ dag = dag_run.dag
874
+ if not dag:
875
+ logger.warning(
876
+ f"DataHub listener could not find DAG for {dag_run.dag_id} - {dag_run.run_id}. Dag won't be captured"
877
+ )
878
+ return
879
+
880
+ dataflow = AirflowGenerator.generate_dataflow(
881
+ config=self.config,
882
+ dag=dag, # type: ignore[arg-type]
883
+ )
884
+ dataflow.emit(self.emitter, callback=self._make_emit_callback())
885
+ logger.debug(f"Emitted DataHub DataFlow: {dataflow}")
886
+
887
+ event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
888
+ entityUrn=str(dataflow.urn), aspect=StatusClass(removed=False)
889
+ )
890
+ self.emitter.emit(event)
891
+
892
+ for task in dag.tasks:
893
+ task_urn = builder.make_data_job_urn_with_flow(
894
+ str(dataflow.urn), task.task_id
895
+ )
896
+ event = MetadataChangeProposalWrapper(
897
+ entityUrn=task_urn, aspect=StatusClass(removed=False)
898
+ )
899
+ self.emitter.emit(event)
900
+
901
+ if self.config.platform_instance:
902
+ instance = make_dataplatform_instance_urn(
903
+ platform="airflow",
904
+ instance=self.config.platform_instance,
905
+ )
906
+ event = MetadataChangeProposalWrapper(
907
+ entityUrn=str(dataflow.urn),
908
+ aspect=DataPlatformInstanceClass(
909
+ platform=make_data_platform_urn("airflow"),
910
+ instance=instance,
911
+ ),
912
+ )
913
+ self.emitter.emit(event)
914
+
915
+ # emit tags
916
+ for tag in dataflow.tags:
917
+ tag_urn = builder.make_tag_urn(tag)
918
+
919
+ event = MetadataChangeProposalWrapper(
920
+ entityUrn=tag_urn, aspect=StatusClass(removed=False)
921
+ )
922
+ self.emitter.emit(event)
923
+
924
+ browsePaths: List[BrowsePathEntryClass] = []
925
+ if self.config.platform_instance:
926
+ urn = make_dataplatform_instance_urn(
927
+ "airflow", self.config.platform_instance
928
+ )
929
+ browsePaths.append(BrowsePathEntryClass(self.config.platform_instance, urn))
930
+ browsePaths.append(BrowsePathEntryClass(str(dag.dag_id)))
931
+ browse_path_v2_event: MetadataChangeProposalWrapper = (
932
+ MetadataChangeProposalWrapper(
933
+ entityUrn=str(dataflow.urn),
934
+ aspect=BrowsePathsV2Class(
935
+ path=browsePaths,
936
+ ),
937
+ )
938
+ )
939
+ self.emitter.emit(browse_path_v2_event)
940
+
941
+ if dag.dag_id == _DATAHUB_CLEANUP_DAG:
942
+ assert self.graph
943
+
944
+ logger.debug("Initiating the cleanup of obsolete data from datahub")
945
+
946
+ # get all ingested dataflow and datajob
947
+ ingested_dataflow_urns = list(
948
+ self.graph.get_urns_by_filter(
949
+ platform="airflow",
950
+ entity_types=["dataFlow"],
951
+ platform_instance=self.config.platform_instance,
952
+ )
953
+ )
954
+ ingested_datajob_urns = list(
955
+ self.graph.get_urns_by_filter(
956
+ platform="airflow",
957
+ entity_types=["dataJob"],
958
+ platform_instance=self.config.platform_instance,
959
+ )
960
+ )
961
+
962
+ # filter the ingested dataflow and datajob based on the cluster
963
+ filtered_ingested_dataflow_urns: List = []
964
+ filtered_ingested_datajob_urns: List = []
965
+
966
+ for ingested_dataflow_urn in ingested_dataflow_urns:
967
+ data_flow_aspect = self.graph.get_aspect(
968
+ entity_urn=ingested_dataflow_urn, aspect_type=DataFlowKeyClass
969
+ )
970
+ if (
971
+ data_flow_aspect is not None
972
+ and data_flow_aspect.flowId != _DATAHUB_CLEANUP_DAG
974
+ and data_flow_aspect.cluster == self.config.cluster
975
+ ):
976
+ filtered_ingested_dataflow_urns.append(ingested_dataflow_urn)
977
+
978
+ for ingested_datajob_urn in ingested_datajob_urns:
979
+ data_job_aspect = self.graph.get_aspect(
980
+ entity_urn=ingested_datajob_urn, aspect_type=DataJobKeyClass
981
+ )
982
+ if (
983
+ data_job_aspect is not None
984
+ and data_job_aspect.flow in filtered_ingested_dataflow_urns
985
+ ):
986
+ filtered_ingested_datajob_urns.append(ingested_datajob_urn)
987
+
988
+ # get all airflow dags
989
+ all_airflow_dags = SerializedDagModel.read_all_dags().values()
990
+
991
+ airflow_flow_urns: List = []
992
+ airflow_job_urns: List = []
993
+
994
+ for dag in all_airflow_dags:
995
+ flow_urn = builder.make_data_flow_urn(
996
+ orchestrator="airflow",
997
+ flow_id=dag.dag_id,
998
+ cluster=self.config.cluster,
999
+ platform_instance=self.config.platform_instance,
1000
+ )
1001
+ airflow_flow_urns.append(flow_urn)
1002
+
1003
+ for task in dag.tasks:
1004
+ airflow_job_urns.append(
1005
+ builder.make_data_job_urn_with_flow(str(flow_urn), task.task_id)
1006
+ )
1007
+
1008
+ obsolete_pipelines = set(filtered_ingested_dataflow_urns) - set(
1009
+ airflow_flow_urns
1010
+ )
1011
+ obsolete_tasks = set(filtered_ingested_datajob_urns) - set(airflow_job_urns)
1012
+
1013
+ obsolete_urns = obsolete_pipelines.union(obsolete_tasks)
1014
+
1015
+ asyncio.run(self._soft_delete_obsolete_urns(obsolete_urns=obsolete_urns))
1016
+
1017
+ logger.debug(f"total pipelines removed = {len(obsolete_pipelines)}")
1018
+ logger.debug(f"total tasks removed = {len(obsolete_tasks)}")
1019
+
1020
+ if HAS_AIRFLOW_DAG_LISTENER_API:
1021
+
1022
+ @hookimpl
1023
+ @run_in_thread
1024
+ def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None:
1025
+ if self.check_kill_switch():
1026
+ return
1027
+
1028
+ self._set_log_level()
1029
+
1030
+ logger.debug(
1031
+ f"DataHub listener got notification about dag run start for {dag_run.dag_id}"
1032
+ )
1033
+
1034
+ assert dag_run.dag_id
1035
+ if not self.config.dag_filter_pattern.allowed(dag_run.dag_id):
1036
+ logger.debug(f"DAG {dag_run.dag_id} is not allowed by the pattern")
1037
+ return
1038
+
1039
+ self.on_dag_start(dag_run)
1040
+ self.emitter.flush()
1041
+
1042
+ # TODO: Add hooks for on_dag_run_success, on_dag_run_failed -> call AirflowGenerator.complete_dataflow
1043
+
1044
+ if HAS_AIRFLOW_DATASET_LISTENER_API:
1045
+
1046
+ @hookimpl
1047
+ @run_in_thread
1048
+ def on_dataset_created(self, dataset: "Dataset") -> None: # type: ignore[no-untyped-def]
1049
+ self._set_log_level()
1050
+
1051
+ logger.debug(
1052
+ f"DataHub listener got notification about dataset create for {dataset}"
1053
+ )
1054
+
1055
+ @hookimpl
1056
+ @run_in_thread
1057
+ def on_dataset_changed(self, dataset: "Dataset") -> None: # type: ignore[no-untyped-def]
1058
+ self._set_log_level()
1059
+
1060
+ logger.debug(
1061
+ f"DataHub listener got notification about dataset change for {dataset}"
1062
+ )
1063
+
1064
+ async def _soft_delete_obsolete_urns(self, obsolete_urns):
1065
+ delete_tasks = [self._delete_obsolete_data(urn) for urn in obsolete_urns]
1066
+ await asyncio.gather(*delete_tasks)
1067
+
1068
+ async def _delete_obsolete_data(self, obsolete_urn):
1069
+ assert self.graph
1070
+
1071
+ if self.graph.exists(str(obsolete_urn)):
1072
+ self.graph.soft_delete_entity(str(obsolete_urn))
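
The check_kill_switch() method above reads the Airflow Variable datahub_airflow_plugin_disable_listener and treats the literal value "true" (case-insensitive) as a signal to skip all emission. A minimal usage sketch of that kill switch, assuming the Airflow 2.x Variable API and that nothing else manages this variable:

    from airflow.models import Variable

    # Pause DataHub metadata emission; the listener's check_kill_switch() will
    # return True on the next task or DAG event it receives.
    Variable.set("datahub_airflow_plugin_disable_listener", "true")

    # Re-enable emission; any value other than "true" is treated as enabled.
    Variable.set("datahub_airflow_plugin_disable_listener", "false")

Because Variable.get() falls back to "false" when the variable is missing, the listener stays enabled unless the kill switch is explicitly set; the same toggle can be applied from the CLI with airflow variables set datahub_airflow_plugin_disable_listener true.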