acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
  3. datahub_airflow_plugin/_airflow_shims.py +31 -64
  4. datahub_airflow_plugin/_config.py +19 -97
  5. datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
  6. datahub_airflow_plugin/_extractors.py +365 -0
  7. datahub_airflow_plugin/_version.py +1 -1
  8. datahub_airflow_plugin/client/airflow_generator.py +43 -147
  9. datahub_airflow_plugin/datahub_listener.py +790 -19
  10. datahub_airflow_plugin/example_dags/__init__.py +0 -32
  11. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
  12. datahub_airflow_plugin/hooks/datahub.py +2 -11
  13. datahub_airflow_plugin/operators/datahub.py +3 -20
  14. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
  15. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
  16. datahub_airflow_plugin/_airflow_compat.py +0 -32
  17. datahub_airflow_plugin/_airflow_version_specific.py +0 -184
  18. datahub_airflow_plugin/_constants.py +0 -16
  19. datahub_airflow_plugin/airflow2/__init__.py +0 -6
  20. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
  21. datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
  22. datahub_airflow_plugin/airflow2/_extractors.py +0 -477
  23. datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
  24. datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
  25. datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
  26. datahub_airflow_plugin/airflow2/_shims.py +0 -88
  27. datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
  28. datahub_airflow_plugin/airflow3/__init__.py +0 -6
  29. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
  30. datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
  31. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
  32. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
  33. datahub_airflow_plugin/airflow3/_shims.py +0 -82
  34. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
  35. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
  36. datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
  37. datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
  38. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
  39. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
  40. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
  41. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
  42. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
  43. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
  44. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
  45. datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
  46. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
  47. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
  48. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
  49. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/datahub_listener.py
@@ -1,25 +1,796 @@
-"""
-DataHub Airflow Plugin Listener - Version Dispatcher.
+import asyncio
+import copy
+import functools
+import logging
+import os
+import threading
+import time
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, TypeVar, cast
 
-This module automatically imports the correct listener implementation based on
-the installed Airflow version:
-- Airflow 2.x: Uses airflow2 with extractor-based lineage
-- Airflow 3.x: Uses airflow3 with native OpenLineage integration
+import airflow
+from airflow.models import Variable
+from airflow.models.operator import Operator
+from airflow.models.serialized_dag import SerializedDagModel
+from openlineage.airflow.listener import TaskHolder
+from openlineage.airflow.utils import redact_with_exclusions
+from openlineage.client.serde import Serde
 
-This approach allows clean type checking for each version without conflicts.
-"""
+import datahub.emitter.mce_builder as builder
+from datahub.api.entities.datajob import DataJob
+from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+    DataFlowKeyClass,
+    DataJobKeyClass,
+    DataPlatformInstanceClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
+    OperationClass,
+    OperationTypeClass,
+    StatusClass,
+)
+from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+from datahub.telemetry import telemetry
+from datahub_airflow_plugin._airflow_shims import (
+    HAS_AIRFLOW_DAG_LISTENER_API,
+    HAS_AIRFLOW_DATASET_LISTENER_API,
+    get_task_inlets,
+    get_task_outlets,
+)
+from datahub_airflow_plugin._config import DatahubLineageConfig, get_lineage_config
+from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn
+from datahub_airflow_plugin._extractors import SQL_PARSING_RESULT_KEY, ExtractorManager
+from datahub_airflow_plugin._version import __package_name__, __version__
+from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator
+from datahub_airflow_plugin.entities import (
+    _Entity,
+    entities_to_datajob_urn_list,
+    entities_to_dataset_urn_list,
+)
 
-from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
+_F = TypeVar("_F", bound=Callable[..., None])
+if TYPE_CHECKING:
+    from airflow.datasets import Dataset
+    from airflow.models import DAG, DagRun, TaskInstance
+    from sqlalchemy.orm import Session
+
+    # To placate mypy on Airflow versions that don't have the listener API,
+    # we define a dummy hookimpl that's an identity function.
+
+    def hookimpl(f: _F) -> _F:  # type: ignore[misc]
+        return f
 
-if IS_AIRFLOW_3_OR_HIGHER:
-    from datahub_airflow_plugin.airflow3.datahub_listener import (  # type: ignore[assignment]
-        DataHubListener,
-        get_airflow_plugin_listener,
-    )
 else:
-    from datahub_airflow_plugin.airflow2.datahub_listener import (  # type: ignore[assignment]
-        DataHubListener,
-        get_airflow_plugin_listener,
-    )
+    from airflow.listeners import hookimpl
+
+logger = logging.getLogger(__name__)
+
+_airflow_listener_initialized = False
+_airflow_listener: Optional["DataHubListener"] = None
+_RUN_IN_THREAD = os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD", "true").lower() in (
+    "true",
+    "1",
+)
+_RUN_IN_THREAD_TIMEOUT = float(
+    os.getenv("DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD_TIMEOUT", 10)
+)
+_DATAHUB_CLEANUP_DAG = "Datahub_Cleanup"
+
+KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"
+
+
+def get_airflow_plugin_listener() -> Optional["DataHubListener"]:
+    # Using globals instead of functools.lru_cache to make testing easier.
+    global _airflow_listener_initialized
+    global _airflow_listener
+
+    if not _airflow_listener_initialized:
+        _airflow_listener_initialized = True
+
+        plugin_config = get_lineage_config()
+
+        if plugin_config.enabled:
+            _airflow_listener = DataHubListener(config=plugin_config)
+            logger.info(
+                f"DataHub plugin v2 (package: {__package_name__} and version: {__version__}) listener initialized with config: {plugin_config}"
+            )
+            telemetry.telemetry_instance.ping(
+                "airflow-plugin-init",
+                {
+                    "airflow-version": airflow.__version__,
+                    "datahub-airflow-plugin": "v2",
+                    "datahub-airflow-plugin-dag-events": HAS_AIRFLOW_DAG_LISTENER_API,
+                    "datahub-airflow-plugin-dataset-events": HAS_AIRFLOW_DATASET_LISTENER_API,
+                    "capture_executions": plugin_config.capture_executions,
+                    "capture_tags": plugin_config.capture_tags_info,
+                    "capture_ownership": plugin_config.capture_ownership_info,
+                    "enable_extractors": plugin_config.enable_extractors,
+                    "render_templates": plugin_config.render_templates,
+                    "disable_openlineage_plugin": plugin_config.disable_openlineage_plugin,
+                },
+            )
+
+            if plugin_config.disable_openlineage_plugin:
+                # Deactivate the OpenLineagePlugin listener to avoid conflicts/errors.
+                from openlineage.airflow.plugin import OpenLineagePlugin
+
+                OpenLineagePlugin.listeners = []
+
+    return _airflow_listener
+
+
+def run_in_thread(f: _F) -> _F:
+    # This is also responsible for catching exceptions and logging them.
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        try:
+            if _RUN_IN_THREAD:
+                # A poor-man's timeout mechanism.
+                # This ensures that we don't hang the task if the extractors
+                # are slow or the DataHub API is slow to respond.
+
+                thread = threading.Thread(
+                    target=f, args=args, kwargs=kwargs, daemon=True
+                )
+                thread.start()
+
+                if _RUN_IN_THREAD_TIMEOUT > 0:
+                    # If _RUN_IN_THREAD_TIMEOUT is 0, we just kick off the thread and move on.
+                    # Because it's a daemon thread, it'll be automatically killed when the main
+                    # thread exists.
+
+                    start_time = time.time()
+                    thread.join(timeout=_RUN_IN_THREAD_TIMEOUT)
+                    if thread.is_alive():
+                        logger.warning(
+                            f"Thread for {f.__name__} is still running after {_RUN_IN_THREAD_TIMEOUT} seconds. "
+                            "Continuing without waiting for it to finish."
+                        )
+                    else:
+                        logger.debug(
+                            f"Thread for {f.__name__} finished after {time.time() - start_time} seconds"
+                        )
+            else:
+                f(*args, **kwargs)
+        except Exception as e:
+            logger.warning(e, exc_info=True)
+
+    return cast(_F, wrapper)
+
+
+def _render_templates(task_instance: "TaskInstance") -> "TaskInstance":
+    # Render templates in a copy of the task instance.
+    # This is necessary to get the correct operator args in the extractors.
+    try:
+        task_instance_copy = copy.deepcopy(task_instance)
+        task_instance_copy.render_templates()
+        return task_instance_copy
+    except Exception as e:
+        logger.info(
+            f"Error rendering templates in DataHub listener. Jinja-templated variables will not be extracted correctly: {e}. Template rendering improves SQL parsing accuracy. If this causes issues, you can disable it by setting `render_templates` to `false` in the DataHub plugin configuration."
+        )
+        return task_instance
+
+
+class DataHubListener:
+    __name__ = "DataHubListener"
+
+    def __init__(self, config: DatahubLineageConfig):
+        self.config = config
+        self._set_log_level()
+
+        self._emitter = config.make_emitter_hook().make_emitter()
+        self._graph: Optional[DataHubGraph] = None
+        logger.info(f"DataHub plugin v2 using {repr(self._emitter)}")
+
+        # See discussion here https://github.com/OpenLineage/OpenLineage/pull/508 for
+        # why we need to keep track of tasks ourselves.
+        self._task_holder = TaskHolder()
+
+        # In our case, we also want to cache the initial datajob object
+        # so that we can add to it when the task completes.
+        self._datajob_holder: Dict[str, DataJob] = {}
+
+        self.extractor_manager = ExtractorManager()
+
+        # This "inherits" from types.ModuleType to avoid issues with Airflow's listener plugin loader.
+        # It previously (v2.4.x and likely other versions too) would throw errors if it was not a module.
+        # https://github.com/apache/airflow/blob/e99a518970b2d349a75b1647f6b738c8510fa40e/airflow/listeners/listener.py#L56
+        # self.__class__ = types.ModuleType
+
+    @property
+    def emitter(self):
+        return self._emitter
+
+    @property
+    def graph(self) -> Optional[DataHubGraph]:
+        if self._graph:
+            return self._graph
+
+        if isinstance(self._emitter, DatahubRestEmitter) and not isinstance(
+            self._emitter, DataHubGraph
+        ):
+            # This is lazy initialized to avoid throwing errors on plugin load.
+            self._graph = self._emitter.to_graph()
+            self._emitter = self._graph
+
+        return self._graph
+
+    def _set_log_level(self) -> None:
+        """Set the log level for the plugin and its dependencies.
+
+        This may need to be called multiple times, since Airflow sometimes
+        messes with the logging configuration after the plugin is loaded.
+        In particular, the loggers may get changed when the worker starts
+        executing a task.
+        """
+
+        if self.config.log_level:
+            logging.getLogger(__name__.split(".")[0]).setLevel(self.config.log_level)
+        if self.config.debug_emitter:
+            logging.getLogger("datahub.emitter").setLevel(logging.DEBUG)
+
+    def _make_emit_callback(self) -> Callable[[Optional[Exception], str], None]:
+        def emit_callback(err: Optional[Exception], msg: str) -> None:
+            if err:
+                logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err)
+
+        return emit_callback
+
+    def _extract_lineage(
+        self,
+        datajob: DataJob,
+        dagrun: "DagRun",
+        task: "Operator",
+        task_instance: "TaskInstance",
+        complete: bool = False,
+    ) -> None:
+        """
+        Combine lineage (including column lineage) from task inlets/outlets and
+        extractor-generated task_metadata and write it to the datajob. This
+        routine is also responsible for converting the lineage to DataHub URNs.
+        """
+
+        if not self.config.enable_datajob_lineage:
+            return
+
+        input_urns: List[str] = []
+        output_urns: List[str] = []
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        task_metadata = None
+        if self.config.enable_extractors:
+            task_metadata = self.extractor_manager.extract_metadata(
+                dagrun,
+                task,
+                complete=complete,
+                task_instance=task_instance,
+                task_uuid=str(datajob.urn),
+                graph=self.graph,
+            )
+            logger.debug(f"Got task metadata: {task_metadata}")
+
+            # Translate task_metadata.inputs/outputs to DataHub URNs.
+            input_urns.extend(
+                translate_ol_to_datahub_urn(dataset) for dataset in task_metadata.inputs
+            )
+            output_urns.extend(
+                translate_ol_to_datahub_urn(dataset)
+                for dataset in task_metadata.outputs
+            )
+
+        # Add DataHub-native SQL parser results.
+        sql_parsing_result: Optional[SqlParsingResult] = None
+        if task_metadata:
+            sql_parsing_result = task_metadata.run_facets.pop(
+                SQL_PARSING_RESULT_KEY, None
+            )
+        if sql_parsing_result:
+            if error := sql_parsing_result.debug_info.error:
+                logger.info(f"SQL parsing error: {error}", exc_info=error)
+                datajob.properties["datahub_sql_parser_error"] = (
+                    f"{type(error).__name__}: {error}"
+                )
+            if not sql_parsing_result.debug_info.table_error:
+                input_urns.extend(sql_parsing_result.in_tables)
+                output_urns.extend(sql_parsing_result.out_tables)
+
+                if sql_parsing_result.column_lineage:
+                    fine_grained_lineages.extend(
+                        FineGrainedLineageClass(
+                            upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                            downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                            upstreams=[
+                                builder.make_schema_field_urn(
+                                    upstream.table, upstream.column
+                                )
+                                for upstream in column_lineage.upstreams
+                            ],
+                            downstreams=[
+                                builder.make_schema_field_urn(
+                                    downstream.table, downstream.column
+                                )
+                                for downstream in [column_lineage.downstream]
+                                if downstream.table
+                            ],
+                        )
+                        for column_lineage in sql_parsing_result.column_lineage
+                    )
+
+        # Add DataHub-native inlets/outlets.
+        # These are filtered out by the extractor, so we need to add them manually.
+        input_urns.extend(
+            iolet.urn for iolet in get_task_inlets(task) if isinstance(iolet, _Entity)
+        )
+        output_urns.extend(
+            iolet.urn for iolet in get_task_outlets(task) if isinstance(iolet, _Entity)
+        )
+
+        # Write the lineage to the datajob object.
+        datajob.inlets.extend(entities_to_dataset_urn_list(input_urns))
+        datajob.outlets.extend(entities_to_dataset_urn_list(output_urns))
+        datajob.upstream_urns.extend(entities_to_datajob_urn_list(input_urns))
+        datajob.fine_grained_lineages.extend(fine_grained_lineages)
+
+        # Merge in extra stuff that was present in the DataJob we constructed
+        # at the start of the task.
+        if complete:
+            original_datajob = self._datajob_holder.get(str(datajob.urn), None)
+        else:
+            self._datajob_holder[str(datajob.urn)] = datajob
+            original_datajob = None
+
+        if original_datajob:
+            logger.debug("Merging start datajob into finish datajob")
+            datajob.inlets.extend(original_datajob.inlets)
+            datajob.outlets.extend(original_datajob.outlets)
+            datajob.upstream_urns.extend(original_datajob.upstream_urns)
+            datajob.fine_grained_lineages.extend(original_datajob.fine_grained_lineages)
+
+            for k, v in original_datajob.properties.items():
+                datajob.properties.setdefault(k, v)
+
+        # Deduplicate inlets/outlets.
+        datajob.inlets = list(sorted(set(datajob.inlets), key=lambda x: str(x)))
+        datajob.outlets = list(sorted(set(datajob.outlets), key=lambda x: str(x)))
+        datajob.upstream_urns = list(
+            sorted(set(datajob.upstream_urns), key=lambda x: str(x))
+        )
+
+        # Write all other OL facets as DataHub properties.
+        if task_metadata:
+            for k, v in task_metadata.job_facets.items():
+                datajob.properties[f"openlineage_job_facet_{k}"] = Serde.to_json(
+                    redact_with_exclusions(v)
+                )
+
+            for k, v in task_metadata.run_facets.items():
+                datajob.properties[f"openlineage_run_facet_{k}"] = Serde.to_json(
+                    redact_with_exclusions(v)
+                )
+
+    def check_kill_switch(self):
+        if Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true":
+            logger.debug("DataHub listener disabled by kill switch")
+            return True
+        return False
+
+    @hookimpl
+    @run_in_thread
+    def on_task_instance_running(
+        self,
+        previous_state: None,
+        task_instance: "TaskInstance",
+        session: "Session",  # This will always be QUEUED
+    ) -> None:
+        if self.check_kill_switch():
+            return
+        self._set_log_level()
+
+        # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508.
+        if not hasattr(task_instance, "task"):
+            # The type ignore is to placate mypy on Airflow 2.1.x.
+            logger.warning(
+                f"No task set for task_id: {task_instance.task_id} - "  # type: ignore[attr-defined]
+                f"dag_id: {task_instance.dag_id} - run_id {task_instance.run_id}"  # type: ignore[attr-defined]
+            )
+            return
+
+        logger.debug(
+            f"DataHub listener got notification about task instance start for {task_instance.task_id} of dag {task_instance.dag_id}"
+        )
+
+        if not self.config.dag_filter_pattern.allowed(task_instance.dag_id):
+            logger.debug(f"DAG {task_instance.dag_id} is not allowed by the pattern")
+            return
+
+        if self.config.render_templates:
+            task_instance = _render_templates(task_instance)
+
+        # The type ignore is to placate mypy on Airflow 2.1.x.
+        dagrun: "DagRun" = task_instance.dag_run  # type: ignore[attr-defined]
+        task = task_instance.task
+        assert task is not None
+        dag: "DAG" = task.dag  # type: ignore[assignment]
+
+        self._task_holder.set_task(task_instance)
+
+        # Handle async operators in Airflow 2.3 by skipping deferred state.
+        # Inspired by https://github.com/OpenLineage/OpenLineage/pull/1601
+        if task_instance.next_method is not None:  # type: ignore[attr-defined]
+            return
+
+        datajob = AirflowGenerator.generate_datajob(
+            cluster=self.config.cluster,
+            task=task,
+            dag=dag,
+            capture_tags=self.config.capture_tags_info,
+            capture_owner=self.config.capture_ownership_info,
+            config=self.config,
+        )
+
+        # TODO: Make use of get_task_location to extract github urls.
+
+        # Add lineage info.
+        self._extract_lineage(datajob, dagrun, task, task_instance)
+
+        # TODO: Add handling for Airflow mapped tasks using task_instance.map_index
+
+        for mcp in datajob.generate_mcp(
+            generate_lineage=self.config.enable_datajob_lineage,
+            materialize_iolets=self.config.materialize_iolets,
+        ):
+            self.emitter.emit(mcp, self._make_emit_callback())
+        logger.debug(f"Emitted DataHub Datajob start: {datajob}")
+
+        if self.config.capture_executions:
+            dpi = AirflowGenerator.run_datajob(
+                emitter=self.emitter,
+                config=self.config,
+                ti=task_instance,
+                dag=dag,
+                dag_run=dagrun,
+                datajob=datajob,
+                emit_templates=False,
+            )
+            logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}")
+
+        self.emitter.flush()
+
+        logger.debug(
+            f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}"
+        )
+
+        self.materialize_iolets(datajob)
+
+    def materialize_iolets(self, datajob: DataJob) -> None:
+        if self.config.materialize_iolets:
+            for outlet in datajob.outlets:
+                reported_time: int = int(time.time() * 1000)
+                operation = OperationClass(
+                    timestampMillis=reported_time,
+                    operationType=OperationTypeClass.CREATE,
+                    lastUpdatedTimestamp=reported_time,
+                    actor=builder.make_user_urn("airflow"),
+                )
+
+                operation_mcp = MetadataChangeProposalWrapper(
+                    entityUrn=str(outlet), aspect=operation
+                )
+
+                self.emitter.emit(operation_mcp)
+                logger.debug(f"Emitted Dataset Operation: {outlet}")
+        else:
+            if self.graph:
+                for outlet in datajob.outlets:
+                    if not self.graph.exists(str(outlet)):
+                        logger.warning(f"Dataset {str(outlet)} not materialized")
+                for inlet in datajob.inlets:
+                    if not self.graph.exists(str(inlet)):
+                        logger.warning(f"Dataset {str(inlet)} not materialized")
+
+    def on_task_instance_finish(
+        self, task_instance: "TaskInstance", status: InstanceRunResult
+    ) -> None:
+        dagrun: "DagRun" = task_instance.dag_run  # type: ignore[attr-defined]
+
+        if self.config.render_templates:
+            task_instance = _render_templates(task_instance)
+
+        # We must prefer the task attribute, in case modifications to the task's inlets/outlets
+        # were made by the execute() method.
+        if getattr(task_instance, "task", None):
+            task = task_instance.task
+        else:
+            task = self._task_holder.get_task(task_instance)
+        assert task is not None
+
+        dag: "DAG" = task.dag  # type: ignore[assignment]
+
+        if not self.config.dag_filter_pattern.allowed(dag.dag_id):
+            logger.debug(f"DAG {dag.dag_id} is not allowed by the pattern")
+            return
+
+        datajob = AirflowGenerator.generate_datajob(
+            cluster=self.config.cluster,
+            task=task,
+            dag=dag,
+            capture_tags=self.config.capture_tags_info,
+            capture_owner=self.config.capture_ownership_info,
+            config=self.config,
+        )
+
+        # Add lineage info.
+        self._extract_lineage(datajob, dagrun, task, task_instance, complete=True)
+
+        for mcp in datajob.generate_mcp(
+            generate_lineage=self.config.enable_datajob_lineage,
+            materialize_iolets=self.config.materialize_iolets,
+        ):
+            self.emitter.emit(mcp, self._make_emit_callback())
+        logger.debug(f"Emitted DataHub Datajob finish w/ status {status}: {datajob}")
+
+        if self.config.capture_executions:
+            dpi = AirflowGenerator.complete_datajob(
+                emitter=self.emitter,
+                cluster=self.config.cluster,
+                ti=task_instance,
+                dag=dag,
+                dag_run=dagrun,
+                datajob=datajob,
+                result=status,
+                config=self.config,
+            )
+            logger.debug(
+                f"Emitted DataHub DataProcess Instance with status {status}: {dpi}"
+            )
+
+        self.emitter.flush()
+
+    @hookimpl
+    @run_in_thread
+    def on_task_instance_success(
+        self, previous_state: None, task_instance: "TaskInstance", session: "Session"
+    ) -> None:
+        if self.check_kill_switch():
+            return
+
+        self._set_log_level()
+
+        logger.debug(
+            f"DataHub listener got notification about task instance success for {task_instance.task_id}"
+        )
+        self.on_task_instance_finish(task_instance, status=InstanceRunResult.SUCCESS)
+        logger.debug(
+            f"DataHub listener finished processing task instance success for {task_instance.task_id}"
+        )
+
+    @hookimpl
+    @run_in_thread
+    def on_task_instance_failed(
+        self, previous_state: None, task_instance: "TaskInstance", session: "Session"
+    ) -> None:
+        if self.check_kill_switch():
+            return
+
+        self._set_log_level()
+
+        logger.debug(
+            f"DataHub listener got notification about task instance failure for {task_instance.task_id}"
+        )
+
+        # TODO: Handle UP_FOR_RETRY state.
+        self.on_task_instance_finish(task_instance, status=InstanceRunResult.FAILURE)
+        logger.debug(
+            f"DataHub listener finished processing task instance failure for {task_instance.task_id}"
+        )
+
+    def on_dag_start(self, dag_run: "DagRun") -> None:
+        dag = dag_run.dag
+        if not dag:
+            logger.warning(
+                f"DataHub listener could not find DAG for {dag_run.dag_id} - {dag_run.run_id}. Dag won't be captured"
+            )
+            return
+
+        dataflow = AirflowGenerator.generate_dataflow(
+            config=self.config,
+            dag=dag,
+        )
+        dataflow.emit(self.emitter, callback=self._make_emit_callback())
+        logger.debug(f"Emitted DataHub DataFlow: {dataflow}")
+
+        event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
+            entityUrn=str(dataflow.urn), aspect=StatusClass(removed=False)
+        )
+        self.emitter.emit(event)
+
+        for task in dag.tasks:
+            task_urn = builder.make_data_job_urn_with_flow(
+                str(dataflow.urn), task.task_id
+            )
+            event = MetadataChangeProposalWrapper(
+                entityUrn=task_urn, aspect=StatusClass(removed=False)
+            )
+            self.emitter.emit(event)
+
+        if self.config.platform_instance:
+            instance = make_dataplatform_instance_urn(
+                platform="airflow",
+                instance=self.config.platform_instance,
+            )
+            event = MetadataChangeProposalWrapper(
+                entityUrn=str(dataflow.urn),
+                aspect=DataPlatformInstanceClass(
+                    platform=make_data_platform_urn("airflow"),
+                    instance=instance,
+                ),
+            )
+            self.emitter.emit(event)
+
+        # emit tags
+        for tag in dataflow.tags:
+            tag_urn = builder.make_tag_urn(tag)
+
+            event = MetadataChangeProposalWrapper(
+                entityUrn=tag_urn, aspect=StatusClass(removed=False)
+            )
+            self.emitter.emit(event)
+
+        browsePaths: List[BrowsePathEntryClass] = []
+        if self.config.platform_instance:
+            urn = make_dataplatform_instance_urn(
+                "airflow", self.config.platform_instance
+            )
+            browsePaths.append(BrowsePathEntryClass(self.config.platform_instance, urn))
+        browsePaths.append(BrowsePathEntryClass(str(dag.dag_id)))
+        browse_path_v2_event: MetadataChangeProposalWrapper = (
+            MetadataChangeProposalWrapper(
+                entityUrn=str(dataflow.urn),
+                aspect=BrowsePathsV2Class(
+                    path=browsePaths,
+                ),
+            )
+        )
+        self.emitter.emit(browse_path_v2_event)
+
+        if dag.dag_id == _DATAHUB_CLEANUP_DAG:
+            assert self.graph
+
+            logger.debug("Initiating the cleanup of obsolete data from datahub")
+
+            # get all ingested dataflow and datajob
+            ingested_dataflow_urns = list(
+                self.graph.get_urns_by_filter(
+                    platform="airflow",
+                    entity_types=["dataFlow"],
+                    platform_instance=self.config.platform_instance,
+                )
+            )
+            ingested_datajob_urns = list(
+                self.graph.get_urns_by_filter(
+                    platform="airflow",
+                    entity_types=["dataJob"],
+                    platform_instance=self.config.platform_instance,
+                )
+            )
+
+            # filter the ingested dataflow and datajob based on the cluster
+            filtered_ingested_dataflow_urns: List = []
+            filtered_ingested_datajob_urns: List = []
+
+            for ingested_dataflow_urn in ingested_dataflow_urns:
+                data_flow_aspect = self.graph.get_aspect(
+                    entity_urn=ingested_dataflow_urn, aspect_type=DataFlowKeyClass
+                )
+                if (
+                    data_flow_aspect is not None
+                    and data_flow_aspect.flowId != _DATAHUB_CLEANUP_DAG
+                    and data_flow_aspect is not None
+                    and data_flow_aspect.cluster == self.config.cluster
+                ):
+                    filtered_ingested_dataflow_urns.append(ingested_dataflow_urn)
+
+            for ingested_datajob_urn in ingested_datajob_urns:
+                data_job_aspect = self.graph.get_aspect(
+                    entity_urn=ingested_datajob_urn, aspect_type=DataJobKeyClass
+                )
+                if (
+                    data_job_aspect is not None
+                    and data_job_aspect.flow in filtered_ingested_dataflow_urns
+                ):
+                    filtered_ingested_datajob_urns.append(ingested_datajob_urn)
+
+            # get all airflow dags
+            all_airflow_dags = SerializedDagModel.read_all_dags().values()
+
+            airflow_flow_urns: List = []
+            airflow_job_urns: List = []
+
+            for dag in all_airflow_dags:
+                flow_urn = builder.make_data_flow_urn(
+                    orchestrator="airflow",
+                    flow_id=dag.dag_id,
+                    cluster=self.config.cluster,
+                    platform_instance=self.config.platform_instance,
+                )
+                airflow_flow_urns.append(flow_urn)
+
+                for task in dag.tasks:
+                    airflow_job_urns.append(
+                        builder.make_data_job_urn_with_flow(str(flow_urn), task.task_id)
+                    )
+
+            obsolete_pipelines = set(filtered_ingested_dataflow_urns) - set(
+                airflow_flow_urns
+            )
+            obsolete_tasks = set(filtered_ingested_datajob_urns) - set(airflow_job_urns)
+
+            obsolete_urns = obsolete_pipelines.union(obsolete_tasks)
+
+            asyncio.run(self._soft_delete_obsolete_urns(obsolete_urns=obsolete_urns))
+
+            logger.debug(f"total pipelines removed = {len(obsolete_pipelines)}")
+            logger.debug(f"total tasks removed = {len(obsolete_tasks)}")
+
+    @hookimpl
+    @run_in_thread
+    def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None:
+        if self.check_kill_switch():
+            return
+
+        self._set_log_level()
+
+        logger.debug(
+            f"DataHub listener got notification about dag run start for {dag_run.dag_id}"
+        )
+
+        assert dag_run.dag_id
+        if not self.config.dag_filter_pattern.allowed(dag_run.dag_id):
+            logger.debug(f"DAG {dag_run.dag_id} is not allowed by the pattern")
+            return
+
+        self.on_dag_start(dag_run)
+        self.emitter.flush()
+
+    # TODO: Add hooks for on_dag_run_success, on_dag_run_failed -> call AirflowGenerator.complete_dataflow
+
+    if HAS_AIRFLOW_DATASET_LISTENER_API:
+
+        @hookimpl
+        @run_in_thread
+        def on_dataset_created(self, dataset: "Dataset") -> None:
+            self._set_log_level()
+
+            logger.debug(
+                f"DataHub listener got notification about dataset create for {dataset}"
+            )
+
+        @hookimpl
+        @run_in_thread
+        def on_dataset_changed(self, dataset: "Dataset") -> None:
+            self._set_log_level()
+
+            logger.debug(
+                f"DataHub listener got notification about dataset change for {dataset}"
+            )
+
+    async def _soft_delete_obsolete_urns(self, obsolete_urns):
+        delete_tasks = [self._delete_obsolete_data(urn) for urn in obsolete_urns]
+        await asyncio.gather(*delete_tasks)
+
+    async def _delete_obsolete_data(self, obsolete_urn):
+        assert self.graph
 
-__all__ = ["DataHubListener", "get_airflow_plugin_listener"]
+        if self.graph.exists(str(obsolete_urn)):
+            self.graph.soft_delete_entity(str(obsolete_urn))
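
The listener added above exposes a few runtime knobs that are visible in the diff: a kill-switch Airflow Variable (KILL_SWITCH_VARIABLE_NAME) and the DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD / DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD_TIMEOUT environment variables, which control whether listener hooks run in a daemon thread and how long the hook waits for it. The snippet below is a minimal sketch of toggling them, assuming a working Airflow deployment with this plugin installed; it is based only on the names appearing in this diff, not on the package's documentation.

# Sketch: runtime configuration of the DataHub listener (names taken from the diff above).
import os

from airflow.models import Variable

# Checked by DataHubListener.check_kill_switch() at the start of each listener hook;
# setting it to "true" disables metadata emission without uninstalling the plugin.
Variable.set("datahub_airflow_plugin_disable_listener", "true")

# Read once when datahub_listener.py is imported: run hooks in a daemon thread, and
# wait up to N seconds for that thread to finish (0 means fire-and-forget).
os.environ["DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD"] = "true"
os.environ["DATAHUB_AIRFLOW_PLUGIN_RUN_IN_THREAD_TIMEOUT"] = "10"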