apache-airflow-providers-openlineage 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.

@@ -39,15 +39,56 @@ class AirflowMappedTaskRunFacet(BaseFacet):
39
39
 
40
40
  @classmethod
41
41
  def from_task_instance(cls, task_instance):
42
- task = task_instance.task
43
- from airflow.providers.openlineage.utils.utils import get_operator_class
42
+ from airflow.providers.openlineage.utils.utils import get_fully_qualified_class_name
44
43
 
45
44
  return cls(
46
45
  mapIndex=task_instance.map_index,
47
- operatorClass=f"{get_operator_class(task).__module__}.{get_operator_class(task).__name__}",
46
+ operatorClass=get_fully_qualified_class_name(task_instance.task),
48
47
  )
49
48
 
50
49
 
50
+ @define(slots=False)
51
+ class AirflowJobFacet(BaseFacet):
52
+ """
53
+ Composite Airflow job facet.
54
+
55
+ This facet encapsulates all the necessary information to re-create full scope of an Airflow DAG logic,
56
+ enabling reconstruction, visualization, and analysis of DAGs in a comprehensive manner.
57
+ It includes detailed representations of the tasks, task groups, and their hierarchical relationships,
58
+ making it possible to draw a graph that visually represents the entire DAG structure (like in Airflow UI).
59
+ It also indicates whether a task should emit an OpenLineage (OL) event, enabling consumers to anticipate
60
+ the number of events and identify the tasks from which they can expect these events.
61
+
62
+ Attributes:
63
+ taskTree: A dictionary representing the hierarchical structure of tasks in the DAG.
64
+ taskGroups: A dictionary that contains information about task groups within the DAG.
65
+ tasks: A dictionary detailing individual tasks within the DAG.
66
+ """
67
+
68
+ taskTree: dict
69
+ taskGroups: dict
70
+ tasks: dict
71
+
72
+
73
+ @define(slots=False)
74
+ class AirflowStateRunFacet(BaseFacet):
75
+ """
76
+ Airflow facet providing state information.
77
+
78
+ This facet is designed to be sent at a completion event, offering state information about
79
+ the DAG run and each individual task. This information is crucial for understanding
80
+ the execution flow and comprehensive post-run analysis and debugging, including why certain tasks
81
+ did not emit events, which can occur due to the use of control flow operators like the BranchOperator.
82
+
83
+ Attributes:
84
+ dagRunState: This indicates the final status of the entire DAG run (e.g., "success", "failed").
85
+ tasksState: A dictionary mapping task IDs to their respective states. (e.g., "failed", "skipped").
86
+ """
87
+
88
+ dagRunState: str
89
+ tasksState: dict[str, str]
90
+
91
+
51
92
  @define(slots=False)
52
93
  class AirflowRunFacet(BaseFacet):
53
94
  """Composite Airflow run facet."""
@@ -61,7 +102,8 @@ class AirflowRunFacet(BaseFacet):
61
102
 
62
103
  @define(slots=False)
63
104
  class UnknownOperatorInstance(RedactMixin):
64
- """Describes an unknown operator.
105
+ """
106
+ Describes an unknown operator.
65
107
 
66
108
  This specifies the (class) name of the operator and its properties.
67
109
  """
@@ -17,18 +17,23 @@
17
17
  from __future__ import annotations
18
18
 
19
19
  import logging
20
+ import os
20
21
  from concurrent.futures import ProcessPoolExecutor
21
22
  from datetime import datetime
22
23
  from typing import TYPE_CHECKING
23
24
 
25
+ import psutil
24
26
  from openlineage.client.serde import Serde
27
+ from packaging.version import Version
28
+ from setproctitle import getproctitle, setproctitle
25
29
 
26
- from airflow import __version__ as airflow_version, settings
30
+ from airflow import __version__ as AIRFLOW_VERSION, settings
27
31
  from airflow.listeners import hookimpl
28
32
  from airflow.providers.openlineage import conf
29
33
  from airflow.providers.openlineage.extractors import ExtractorManager
30
34
  from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter, RunState
31
35
  from airflow.providers.openlineage.utils.utils import (
36
+ get_airflow_job_facet,
32
37
  get_airflow_run_facet,
33
38
  get_custom_facets,
34
39
  get_job_name,
@@ -36,6 +41,7 @@ from airflow.providers.openlineage.utils.utils import (
36
41
  is_selective_lineage_enabled,
37
42
  print_warning,
38
43
  )
44
+ from airflow.settings import configure_orm
39
45
  from airflow.stats import Stats
40
46
  from airflow.utils.timeout import timeout
41
47
 
@@ -43,18 +49,29 @@ if TYPE_CHECKING:
43
49
  from sqlalchemy.orm import Session
44
50
 
45
51
  from airflow.models import DagRun, TaskInstance
52
+ from airflow.utils.state import TaskInstanceState
46
53
 
47
54
  _openlineage_listener: OpenLineageListener | None = None
55
+ _IS_AIRFLOW_2_10_OR_HIGHER = Version(Version(AIRFLOW_VERSION).base_version) >= Version("2.10.0")
48
56
 
49
57
 
50
58
  def _get_try_number_success(val):
51
59
  # todo: remove when min airflow version >= 2.10.0
52
- from packaging.version import parse
53
-
54
- if parse(parse(airflow_version).base_version) < parse("2.10.0"):
55
- return val.try_number - 1
56
- else:
60
+ if _IS_AIRFLOW_2_10_OR_HIGHER:
57
61
  return val.try_number
62
+ return val.try_number - 1
63
+
64
+
65
+ def _executor_initializer():
66
+ """
67
+ Initialize worker processes for the executor used for DagRun listener.
68
+
69
+ This function must be picklable, so it cannot be defined as an inner method or local function.
70
+
71
+ Reconfigures the ORM engine to prevent issues that arise when multiple processes interact with
72
+ the Airflow database.
73
+ """
74
+ settings.configure_orm()
58
75
 
59
76
 
60
77
  class OpenLineageListener:
@@ -69,10 +86,10 @@ class OpenLineageListener:
69
86
  @hookimpl
70
87
  def on_task_instance_running(
71
88
  self,
72
- previous_state,
89
+ previous_state: TaskInstanceState,
73
90
  task_instance: TaskInstance,
74
91
  session: Session, # This will always be QUEUED
75
- ):
92
+ ) -> None:
76
93
  if not getattr(task_instance, "task", None) is not None:
77
94
  self.log.warning(
78
95
  "No task set for TI object task_id: %s - dag_id: %s - run_id %s",
@@ -111,13 +128,16 @@ class OpenLineageListener:
111
128
  # we return here because Airflow 2.3 needs task from deferred state
112
129
  if task_instance.next_method is not None:
113
130
  return
114
- parent_run_id = self.adapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
131
+ parent_run_id = self.adapter.build_dag_run_id(
132
+ dag_id=dag.dag_id,
133
+ execution_date=dagrun.execution_date,
134
+ )
115
135
 
116
136
  task_uuid = self.adapter.build_task_instance_run_id(
117
137
  dag_id=dag.dag_id,
118
138
  task_id=task.task_id,
119
- execution_date=task_instance.execution_date,
120
139
  try_number=task_instance.try_number,
140
+ execution_date=task_instance.execution_date,
121
141
  )
122
142
  event_type = RunState.RUNNING.value.lower()
123
143
  operator_name = task.task_type.lower()
@@ -130,7 +150,6 @@ class OpenLineageListener:
130
150
  dagrun.data_interval_start.isoformat() if dagrun.data_interval_start else None
131
151
  )
132
152
  data_interval_end = dagrun.data_interval_end.isoformat() if dagrun.data_interval_end else None
133
-
134
153
  redacted_event = self.adapter.start_task(
135
154
  run_id=task_uuid,
136
155
  job_name=get_job_name(task),
@@ -153,10 +172,12 @@ class OpenLineageListener:
153
172
  len(Serde.to_json(redacted_event).encode("utf-8")),
154
173
  )
155
174
 
156
- on_running()
175
+ self._execute(on_running, "on_running", use_fork=True)
157
176
 
158
177
  @hookimpl
159
- def on_task_instance_success(self, previous_state, task_instance: TaskInstance, session):
178
+ def on_task_instance_success(
179
+ self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
180
+ ) -> None:
160
181
  self.log.debug("OpenLineage listener got notification about task instance success")
161
182
 
162
183
  dagrun = task_instance.dag_run
@@ -184,13 +205,16 @@ class OpenLineageListener:
184
205
 
185
206
  @print_warning(self.log)
186
207
  def on_success():
187
- parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
208
+ parent_run_id = OpenLineageAdapter.build_dag_run_id(
209
+ dag_id=dag.dag_id,
210
+ execution_date=dagrun.execution_date,
211
+ )
188
212
 
189
213
  task_uuid = OpenLineageAdapter.build_task_instance_run_id(
190
214
  dag_id=dag.dag_id,
191
215
  task_id=task.task_id,
192
- execution_date=task_instance.execution_date,
193
216
  try_number=_get_try_number_success(task_instance),
217
+ execution_date=task_instance.execution_date,
194
218
  )
195
219
  event_type = RunState.COMPLETE.value.lower()
196
220
  operator_name = task.task_type.lower()
@@ -215,10 +239,39 @@ class OpenLineageListener:
215
239
  len(Serde.to_json(redacted_event).encode("utf-8")),
216
240
  )
217
241
 
218
- on_success()
242
+ self._execute(on_success, "on_success", use_fork=True)
219
243
 
220
- @hookimpl
221
- def on_task_instance_failed(self, previous_state, task_instance: TaskInstance, session):
244
+ if _IS_AIRFLOW_2_10_OR_HIGHER:
245
+
246
+ @hookimpl
247
+ def on_task_instance_failed(
248
+ self,
249
+ previous_state: TaskInstanceState,
250
+ task_instance: TaskInstance,
251
+ error: None | str | BaseException,
252
+ session: Session,
253
+ ) -> None:
254
+ self._on_task_instance_failed(
255
+ previous_state=previous_state, task_instance=task_instance, error=error, session=session
256
+ )
257
+
258
+ else:
259
+
260
+ @hookimpl
261
+ def on_task_instance_failed(
262
+ self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
263
+ ) -> None:
264
+ self._on_task_instance_failed(
265
+ previous_state=previous_state, task_instance=task_instance, error=None, session=session
266
+ )
267
+
268
+ def _on_task_instance_failed(
269
+ self,
270
+ previous_state: TaskInstanceState,
271
+ task_instance: TaskInstance,
272
+ session: Session,
273
+ error: None | str | BaseException = None,
274
+ ) -> None:
222
275
  self.log.debug("OpenLineage listener got notification about task instance failure")
223
276
 
224
277
  dagrun = task_instance.dag_run
@@ -246,13 +299,16 @@ class OpenLineageListener:
246
299
 
247
300
  @print_warning(self.log)
248
301
  def on_failure():
249
- parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
302
+ parent_run_id = OpenLineageAdapter.build_dag_run_id(
303
+ dag_id=dag.dag_id,
304
+ execution_date=dagrun.execution_date,
305
+ )
250
306
 
251
307
  task_uuid = OpenLineageAdapter.build_task_instance_run_id(
252
308
  dag_id=dag.dag_id,
253
309
  task_id=task.task_id,
254
- execution_date=task_instance.execution_date,
255
310
  try_number=task_instance.try_number,
311
+ execution_date=task_instance.execution_date,
256
312
  )
257
313
  event_type = RunState.FAIL.value.lower()
258
314
  operator_name = task.task_type.lower()
@@ -271,40 +327,76 @@ class OpenLineageListener:
271
327
  parent_run_id=parent_run_id,
272
328
  end_time=end_date.isoformat(),
273
329
  task=task_metadata,
330
+ error=error,
274
331
  )
275
332
  Stats.gauge(
276
333
  f"ol.event.size.{event_type}.{operator_name}",
277
334
  len(Serde.to_json(redacted_event).encode("utf-8")),
278
335
  )
279
336
 
280
- on_failure()
337
+ self._execute(on_failure, "on_failure", use_fork=True)
338
+
339
+ def _execute(self, callable, callable_name: str, use_fork: bool = False):
340
+ if use_fork:
341
+ self._fork_execute(callable, callable_name)
342
+ else:
343
+ callable()
344
+
345
+ def _terminate_with_wait(self, process: psutil.Process):
346
+ process.terminate()
347
+ try:
348
+ # Waiting for max 3 seconds to make sure process can clean up before being killed.
349
+ process.wait(timeout=3)
350
+ except psutil.TimeoutExpired:
351
+ # If it's not dead by then, then force kill.
352
+ process.kill()
353
+
354
+ def _fork_execute(self, callable, callable_name: str):
355
+ self.log.debug("Will fork to execute OpenLineage process.")
356
+ pid = os.fork()
357
+ if pid:
358
+ process = psutil.Process(pid)
359
+ try:
360
+ self.log.debug("Waiting for process %s", pid)
361
+ process.wait(conf.execution_timeout())
362
+ except psutil.TimeoutExpired:
363
+ self.log.warning(
364
+ "OpenLineage process %s expired. This should not affect process execution.", pid
365
+ )
366
+ self._terminate_with_wait(process)
367
+ except BaseException:
368
+ # Kill the process directly.
369
+ self._terminate_with_wait(process)
370
+ self.log.debug("Process with pid %s finished - parent", pid)
371
+ else:
372
+ setproctitle(getproctitle() + " - OpenLineage - " + callable_name)
373
+ configure_orm(disable_connection_pool=True)
374
+ self.log.debug("Executing OpenLineage process - %s - pid %s", callable_name, os.getpid())
375
+ callable()
376
+ self.log.debug("Process with current pid finishes after %s", callable_name)
377
+ os._exit(0)
281
378
 
282
379
  @property
283
- def executor(self):
284
- def initializer():
285
- # Re-configure the ORM engine as there are issues with multiple processes
286
- # if process calls Airflow DB.
287
- settings.configure_orm()
288
-
380
+ def executor(self) -> ProcessPoolExecutor:
289
381
  if not self._executor:
290
382
  self._executor = ProcessPoolExecutor(
291
383
  max_workers=conf.dag_state_change_process_pool_size(),
292
- initializer=initializer,
384
+ initializer=_executor_initializer,
293
385
  )
294
386
  return self._executor
295
387
 
296
388
  @hookimpl
297
- def on_starting(self, component):
389
+ def on_starting(self, component) -> None:
298
390
  self.log.debug("on_starting: %s", component.__class__.__name__)
299
391
 
300
392
  @hookimpl
301
- def before_stopping(self, component):
393
+ def before_stopping(self, component) -> None:
302
394
  self.log.debug("before_stopping: %s", component.__class__.__name__)
303
395
  with timeout(30):
304
396
  self.executor.shutdown(wait=True)
305
397
 
306
398
  @hookimpl
307
- def on_dag_run_running(self, dag_run: DagRun, msg: str):
399
+ def on_dag_run_running(self, dag_run: DagRun, msg: str) -> None:
308
400
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
309
401
  self.log.debug(
310
402
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -326,10 +418,13 @@ class OpenLineageListener:
326
418
  msg=msg,
327
419
  nominal_start_time=data_interval_start,
328
420
  nominal_end_time=data_interval_end,
421
+ # AirflowJobFacet should be created outside ProcessPoolExecutor that pickles objects,
422
+ # as it causes lack of some TaskGroup attributes and crashes event emission.
423
+ job_facets={**get_airflow_job_facet(dag_run=dag_run)},
329
424
  )
330
425
 
331
426
  @hookimpl
332
- def on_dag_run_success(self, dag_run: DagRun, msg: str):
427
+ def on_dag_run_success(self, dag_run: DagRun, msg: str) -> None:
333
428
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
334
429
  self.log.debug(
335
430
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -346,7 +441,7 @@ class OpenLineageListener:
346
441
  self.executor.submit(self.adapter.dag_success, dag_run=dag_run, msg=msg)
347
442
 
348
443
  @hookimpl
349
- def on_dag_run_failed(self, dag_run: DagRun, msg: str):
444
+ def on_dag_run_failed(self, dag_run: DagRun, msg: str) -> None:
350
445
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
351
446
  self.log.debug(
352
447
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -61,8 +61,8 @@ def lineage_run_id(task_instance: TaskInstance):
61
61
  return OpenLineageAdapter.build_task_instance_run_id(
62
62
  dag_id=task_instance.dag_id,
63
63
  task_id=task_instance.task_id,
64
- execution_date=task_instance.execution_date,
65
64
  try_number=task_instance.try_number,
65
+ execution_date=task_instance.execution_date,
66
66
  )
67
67
 
68
68
 
@@ -39,6 +39,7 @@ from airflow.providers.openlineage.utils.sql import (
39
39
  get_table_schemas,
40
40
  )
41
41
  from airflow.typing_compat import TypedDict
42
+ from airflow.utils.log.logging_mixin import LoggingMixin
42
43
 
43
44
  if TYPE_CHECKING:
44
45
  from sqlalchemy.engine import Engine
@@ -116,19 +117,27 @@ def from_table_meta(
116
117
  return Dataset(namespace=namespace, name=name if not is_uppercase else name.upper())
117
118
 
118
119
 
119
- class SQLParser:
120
- """Interface for openlineage-sql.
120
+ class SQLParser(LoggingMixin):
121
+ """
122
+ Interface for openlineage-sql.
121
123
 
122
124
  :param dialect: dialect specific to the database
123
125
  :param default_schema: schema applied to each table with no schema parsed
124
126
  """
125
127
 
126
128
  def __init__(self, dialect: str | None = None, default_schema: str | None = None) -> None:
129
+ super().__init__()
127
130
  self.dialect = dialect
128
131
  self.default_schema = default_schema
129
132
 
130
133
  def parse(self, sql: list[str] | str) -> SqlMeta | None:
131
134
  """Parse a single or a list of SQL statements."""
135
+ self.log.debug(
136
+ "OpenLineage calling SQL parser with SQL %s dialect %s schema %s",
137
+ sql,
138
+ self.dialect,
139
+ self.default_schema,
140
+ )
132
141
  return parse(sql=sql, dialect=self.dialect, default_schema=self.default_schema)
133
142
 
134
143
  def parse_table_schemas(
@@ -151,6 +160,7 @@ class SQLParser:
151
160
  "database": database or database_info.database,
152
161
  "use_flat_cross_db_query": database_info.use_flat_cross_db_query,
153
162
  }
163
+ self.log.info("PRE getting schemas for input and output tables")
154
164
  return get_table_schemas(
155
165
  hook,
156
166
  namespace,
@@ -235,7 +245,8 @@ class SQLParser:
235
245
  sqlalchemy_engine: Engine | None = None,
236
246
  use_connection: bool = True,
237
247
  ) -> OperatorLineage:
238
- """Parse SQL statement(s) and generate OpenLineage metadata.
248
+ """
249
+ Parse SQL statement(s) and generate OpenLineage metadata.
239
250
 
240
251
  Generated OpenLineage metadata contains:
241
252
 
@@ -335,9 +346,8 @@ class SQLParser:
335
346
  return split_statement(sql)
336
347
  return [obj for stmt in sql for obj in cls.split_sql_string(stmt) if obj != ""]
337
348
 
338
- @classmethod
339
349
  def create_information_schema_query(
340
- cls,
350
+ self,
341
351
  tables: list[DbTableMeta],
342
352
  normalize_name: Callable[[str], str],
343
353
  is_cross_db: bool,
@@ -349,7 +359,7 @@ class SQLParser:
349
359
  sqlalchemy_engine: Engine | None = None,
350
360
  ) -> str:
351
361
  """Create SELECT statement to query information schema table."""
352
- tables_hierarchy = cls._get_tables_hierarchy(
362
+ tables_hierarchy = self._get_tables_hierarchy(
353
363
  tables,
354
364
  normalize_name=normalize_name,
355
365
  database=database,
@@ -32,7 +32,8 @@ log = logging.getLogger(__name__)
32
32
 
33
33
 
34
34
  def enable_lineage(obj: T) -> T:
35
- """Set selective enable OpenLineage parameter to True.
35
+ """
36
+ Set selective enable OpenLineage parameter to True.
36
37
 
37
38
  The method also propagates param to tasks if the object is DAG.
38
39
  """
@@ -48,7 +49,8 @@ def enable_lineage(obj: T) -> T:
48
49
 
49
50
 
50
51
  def disable_lineage(obj: T) -> T:
51
- """Set selective enable OpenLineage parameter to False.
52
+ """
53
+ Set selective enable OpenLineage parameter to False.
52
54
 
53
55
  The method also propagates param to tasks if the object is DAG.
54
56
  """
@@ -73,7 +75,8 @@ def is_task_lineage_enabled(task: Operator) -> bool:
73
75
 
74
76
 
75
77
  def is_dag_lineage_enabled(dag: DAG) -> bool:
76
- """Check if DAG is selectively enabled to emit OpenLineage events.
78
+ """
79
+ Check if DAG is selectively enabled to emit OpenLineage events.
77
80
 
78
81
  The method also checks if selective enable parameter is set to True
79
82
  or if any of the tasks in DAG is selectively enabled.
@@ -16,6 +16,7 @@
16
16
  # under the License.
17
17
  from __future__ import annotations
18
18
 
19
+ import logging
19
20
  from collections import defaultdict
20
21
  from contextlib import closing
21
22
  from enum import IntEnum
@@ -33,6 +34,9 @@ if TYPE_CHECKING:
33
34
  from airflow.hooks.base import BaseHook
34
35
 
35
36
 
37
+ log = logging.getLogger(__name__)
38
+
39
+
36
40
  class ColumnIndex(IntEnum):
37
41
  """Enumerates the indices of columns in information schema view."""
38
42
 
@@ -81,7 +85,8 @@ def get_table_schemas(
81
85
  in_query: str | None,
82
86
  out_query: str | None,
83
87
  ) -> tuple[list[Dataset], list[Dataset]]:
84
- """Query database for table schemas.
88
+ """
89
+ Query database for table schemas.
85
90
 
86
91
  Uses provided hook. Responsibility to provide queries for this function is on particular extractors.
87
92
  If query for input or output table isn't provided, the query is skipped.
@@ -90,6 +95,7 @@ def get_table_schemas(
90
95
  if not in_query and not out_query:
91
96
  return [], []
92
97
 
98
+ log.debug("Starting to query database for table schemas")
93
99
  with closing(hook.get_conn()) as conn, closing(conn.cursor()) as cursor:
94
100
  if in_query:
95
101
  cursor.execute(in_query)
@@ -101,11 +107,13 @@ def get_table_schemas(
101
107
  out_datasets = [x.to_dataset(namespace, database, schema) for x in parse_query_result(cursor)]
102
108
  else:
103
109
  out_datasets = []
110
+ log.debug("Got table schema query result from database.")
104
111
  return in_datasets, out_datasets
105
112
 
106
113
 
107
114
  def parse_query_result(cursor) -> list[TableSchema]:
108
- """Fetch results from DB-API 2.0 cursor and creates list of table schemas.
115
+ """
116
+ Fetch results from DB-API 2.0 cursor and creates list of table schemas.
109
117
 
110
118
  For each row it creates :class:`TableSchema`.
111
119
  """
@@ -149,7 +157,7 @@ def create_information_schema_query(
149
157
  sqlalchemy_engine: Engine | None = None,
150
158
  ) -> str:
151
159
  """Create query for getting table schemas from information schema."""
152
- metadata = MetaData(sqlalchemy_engine)
160
+ metadata = MetaData()
153
161
  select_statements = []
154
162
  # Don't iterate over tables hierarchy, just pass it to query single information schema table
155
163
  if use_flat_cross_db_query: