apache-airflow-providers-openlineage 1.8.0rc1__py3-none-any.whl → 1.9.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apache-airflow-providers-openlineage might be problematic; consult the package registry's release page for more details.

@@ -17,18 +17,23 @@
17
17
  from __future__ import annotations
18
18
 
19
19
  import logging
20
+ import os
20
21
  from concurrent.futures import ProcessPoolExecutor
21
22
  from datetime import datetime
22
23
  from typing import TYPE_CHECKING
23
24
 
25
+ import psutil
24
26
  from openlineage.client.serde import Serde
27
+ from packaging.version import Version
28
+ from setproctitle import getproctitle, setproctitle
25
29
 
26
- from airflow import __version__ as airflow_version, settings
30
+ from airflow import __version__ as AIRFLOW_VERSION, settings
27
31
  from airflow.listeners import hookimpl
28
32
  from airflow.providers.openlineage import conf
29
33
  from airflow.providers.openlineage.extractors import ExtractorManager
30
34
  from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter, RunState
31
35
  from airflow.providers.openlineage.utils.utils import (
36
+ get_airflow_job_facet,
32
37
  get_airflow_run_facet,
33
38
  get_custom_facets,
34
39
  get_job_name,
@@ -36,6 +41,7 @@ from airflow.providers.openlineage.utils.utils import (
36
41
  is_selective_lineage_enabled,
37
42
  print_warning,
38
43
  )
44
+ from airflow.settings import configure_orm
39
45
  from airflow.stats import Stats
40
46
  from airflow.utils.timeout import timeout
41
47
 
@@ -43,18 +49,29 @@ if TYPE_CHECKING:
43
49
  from sqlalchemy.orm import Session
44
50
 
45
51
  from airflow.models import DagRun, TaskInstance
52
+ from airflow.utils.state import TaskInstanceState
46
53
 
47
54
  _openlineage_listener: OpenLineageListener | None = None
55
+ _IS_AIRFLOW_2_10_OR_HIGHER = Version(Version(AIRFLOW_VERSION).base_version) >= Version("2.10.0")
48
56
 
49
57
 
50
58
  def _get_try_number_success(val):
51
59
  # todo: remove when min airflow version >= 2.10.0
52
- from packaging.version import parse
53
-
54
- if parse(parse(airflow_version).base_version) < parse("2.10.0"):
55
- return val.try_number - 1
56
- else:
60
+ if _IS_AIRFLOW_2_10_OR_HIGHER:
57
61
  return val.try_number
62
+ return val.try_number - 1
63
+
64
+
65
+ def _executor_initializer():
66
+ """
67
+ Initialize worker processes for the executor used for DagRun listener.
68
+
69
+ This function must be picklable, so it cannot be defined as an inner method or local function.
70
+
71
+ Reconfigures the ORM engine to prevent issues that arise when multiple processes interact with
72
+ the Airflow database.
73
+ """
74
+ settings.configure_orm()
58
75
 
59
76
 
60
77
  class OpenLineageListener:
@@ -69,10 +86,10 @@ class OpenLineageListener:
69
86
  @hookimpl
70
87
  def on_task_instance_running(
71
88
  self,
72
- previous_state,
89
+ previous_state: TaskInstanceState,
73
90
  task_instance: TaskInstance,
74
91
  session: Session, # This will always be QUEUED
75
- ):
92
+ ) -> None:
76
93
  if not getattr(task_instance, "task", None) is not None:
77
94
  self.log.warning(
78
95
  "No task set for TI object task_id: %s - dag_id: %s - run_id %s",
@@ -111,13 +128,16 @@ class OpenLineageListener:
111
128
  # we return here because Airflow 2.3 needs task from deferred state
112
129
  if task_instance.next_method is not None:
113
130
  return
114
- parent_run_id = self.adapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
131
+ parent_run_id = self.adapter.build_dag_run_id(
132
+ dag_id=dag.dag_id,
133
+ execution_date=dagrun.execution_date,
134
+ )
115
135
 
116
136
  task_uuid = self.adapter.build_task_instance_run_id(
117
137
  dag_id=dag.dag_id,
118
138
  task_id=task.task_id,
119
- execution_date=task_instance.execution_date,
120
139
  try_number=task_instance.try_number,
140
+ execution_date=task_instance.execution_date,
121
141
  )
122
142
  event_type = RunState.RUNNING.value.lower()
123
143
  operator_name = task.task_type.lower()
@@ -130,7 +150,6 @@ class OpenLineageListener:
130
150
  dagrun.data_interval_start.isoformat() if dagrun.data_interval_start else None
131
151
  )
132
152
  data_interval_end = dagrun.data_interval_end.isoformat() if dagrun.data_interval_end else None
133
-
134
153
  redacted_event = self.adapter.start_task(
135
154
  run_id=task_uuid,
136
155
  job_name=get_job_name(task),
@@ -153,10 +172,12 @@ class OpenLineageListener:
153
172
  len(Serde.to_json(redacted_event).encode("utf-8")),
154
173
  )
155
174
 
156
- on_running()
175
+ self._execute(on_running, "on_running", use_fork=True)
157
176
 
158
177
  @hookimpl
159
- def on_task_instance_success(self, previous_state, task_instance: TaskInstance, session):
178
+ def on_task_instance_success(
179
+ self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
180
+ ) -> None:
160
181
  self.log.debug("OpenLineage listener got notification about task instance success")
161
182
 
162
183
  dagrun = task_instance.dag_run
@@ -184,13 +205,16 @@ class OpenLineageListener:
184
205
 
185
206
  @print_warning(self.log)
186
207
  def on_success():
187
- parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
208
+ parent_run_id = OpenLineageAdapter.build_dag_run_id(
209
+ dag_id=dag.dag_id,
210
+ execution_date=dagrun.execution_date,
211
+ )
188
212
 
189
213
  task_uuid = OpenLineageAdapter.build_task_instance_run_id(
190
214
  dag_id=dag.dag_id,
191
215
  task_id=task.task_id,
192
- execution_date=task_instance.execution_date,
193
216
  try_number=_get_try_number_success(task_instance),
217
+ execution_date=task_instance.execution_date,
194
218
  )
195
219
  event_type = RunState.COMPLETE.value.lower()
196
220
  operator_name = task.task_type.lower()
@@ -215,10 +239,39 @@ class OpenLineageListener:
215
239
  len(Serde.to_json(redacted_event).encode("utf-8")),
216
240
  )
217
241
 
218
- on_success()
242
+ self._execute(on_success, "on_success", use_fork=True)
219
243
 
220
- @hookimpl
221
- def on_task_instance_failed(self, previous_state, task_instance: TaskInstance, session):
244
+ if _IS_AIRFLOW_2_10_OR_HIGHER:
245
+
246
+ @hookimpl
247
+ def on_task_instance_failed(
248
+ self,
249
+ previous_state: TaskInstanceState,
250
+ task_instance: TaskInstance,
251
+ error: None | str | BaseException,
252
+ session: Session,
253
+ ) -> None:
254
+ self._on_task_instance_failed(
255
+ previous_state=previous_state, task_instance=task_instance, error=error, session=session
256
+ )
257
+
258
+ else:
259
+
260
+ @hookimpl
261
+ def on_task_instance_failed(
262
+ self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
263
+ ) -> None:
264
+ self._on_task_instance_failed(
265
+ previous_state=previous_state, task_instance=task_instance, error=None, session=session
266
+ )
267
+
268
+ def _on_task_instance_failed(
269
+ self,
270
+ previous_state: TaskInstanceState,
271
+ task_instance: TaskInstance,
272
+ session: Session,
273
+ error: None | str | BaseException = None,
274
+ ) -> None:
222
275
  self.log.debug("OpenLineage listener got notification about task instance failure")
223
276
 
224
277
  dagrun = task_instance.dag_run
@@ -246,13 +299,16 @@ class OpenLineageListener:
246
299
 
247
300
  @print_warning(self.log)
248
301
  def on_failure():
249
- parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)
302
+ parent_run_id = OpenLineageAdapter.build_dag_run_id(
303
+ dag_id=dag.dag_id,
304
+ execution_date=dagrun.execution_date,
305
+ )
250
306
 
251
307
  task_uuid = OpenLineageAdapter.build_task_instance_run_id(
252
308
  dag_id=dag.dag_id,
253
309
  task_id=task.task_id,
254
- execution_date=task_instance.execution_date,
255
310
  try_number=task_instance.try_number,
311
+ execution_date=task_instance.execution_date,
256
312
  )
257
313
  event_type = RunState.FAIL.value.lower()
258
314
  operator_name = task.task_type.lower()
@@ -271,40 +327,76 @@ class OpenLineageListener:
271
327
  parent_run_id=parent_run_id,
272
328
  end_time=end_date.isoformat(),
273
329
  task=task_metadata,
330
+ error=error,
274
331
  )
275
332
  Stats.gauge(
276
333
  f"ol.event.size.{event_type}.{operator_name}",
277
334
  len(Serde.to_json(redacted_event).encode("utf-8")),
278
335
  )
279
336
 
280
- on_failure()
337
+ self._execute(on_failure, "on_failure", use_fork=True)
338
+
339
+ def _execute(self, callable, callable_name: str, use_fork: bool = False):
340
+ if use_fork:
341
+ self._fork_execute(callable, callable_name)
342
+ else:
343
+ callable()
344
+
345
+ def _terminate_with_wait(self, process: psutil.Process):
346
+ process.terminate()
347
+ try:
348
+ # Waiting for max 3 seconds to make sure process can clean up before being killed.
349
+ process.wait(timeout=3)
350
+ except psutil.TimeoutExpired:
351
+ # If it's not dead by then, then force kill.
352
+ process.kill()
353
+
354
+ def _fork_execute(self, callable, callable_name: str):
355
+ self.log.debug("Will fork to execute OpenLineage process.")
356
+ pid = os.fork()
357
+ if pid:
358
+ process = psutil.Process(pid)
359
+ try:
360
+ self.log.debug("Waiting for process %s", pid)
361
+ process.wait(conf.execution_timeout())
362
+ except psutil.TimeoutExpired:
363
+ self.log.warning(
364
+ "OpenLineage process %s expired. This should not affect process execution.", pid
365
+ )
366
+ self._terminate_with_wait(process)
367
+ except BaseException:
368
+ # Kill the process directly.
369
+ self._terminate_with_wait(process)
370
+ self.log.warning("Process with pid %s finished - parent", pid)
371
+ else:
372
+ setproctitle(getproctitle() + " - OpenLineage - " + callable_name)
373
+ configure_orm(disable_connection_pool=True)
374
+ self.log.debug("Executing OpenLineage process - %s - pid %s", callable_name, os.getpid())
375
+ callable()
376
+ self.log.debug("Process with current pid finishes after %s", callable_name)
377
+ os._exit(0)
281
378
 
282
379
  @property
283
- def executor(self):
284
- def initializer():
285
- # Re-configure the ORM engine as there are issues with multiple processes
286
- # if process calls Airflow DB.
287
- settings.configure_orm()
288
-
380
+ def executor(self) -> ProcessPoolExecutor:
289
381
  if not self._executor:
290
382
  self._executor = ProcessPoolExecutor(
291
383
  max_workers=conf.dag_state_change_process_pool_size(),
292
- initializer=initializer,
384
+ initializer=_executor_initializer(),
293
385
  )
294
386
  return self._executor
295
387
 
296
388
  @hookimpl
297
- def on_starting(self, component):
389
+ def on_starting(self, component) -> None:
298
390
  self.log.debug("on_starting: %s", component.__class__.__name__)
299
391
 
300
392
  @hookimpl
301
- def before_stopping(self, component):
393
+ def before_stopping(self, component) -> None:
302
394
  self.log.debug("before_stopping: %s", component.__class__.__name__)
303
395
  with timeout(30):
304
396
  self.executor.shutdown(wait=True)
305
397
 
306
398
  @hookimpl
307
- def on_dag_run_running(self, dag_run: DagRun, msg: str):
399
+ def on_dag_run_running(self, dag_run: DagRun, msg: str) -> None:
308
400
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
309
401
  self.log.debug(
310
402
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -326,10 +418,13 @@ class OpenLineageListener:
326
418
  msg=msg,
327
419
  nominal_start_time=data_interval_start,
328
420
  nominal_end_time=data_interval_end,
421
+ # AirflowJobFacet should be created outside ProcessPoolExecutor that pickles objects,
422
+ # as it causes lack of some TaskGroup attributes and crashes event emission.
423
+ job_facets={**get_airflow_job_facet(dag_run=dag_run)},
329
424
  )
330
425
 
331
426
  @hookimpl
332
- def on_dag_run_success(self, dag_run: DagRun, msg: str):
427
+ def on_dag_run_success(self, dag_run: DagRun, msg: str) -> None:
333
428
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
334
429
  self.log.debug(
335
430
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -346,7 +441,7 @@ class OpenLineageListener:
346
441
  self.executor.submit(self.adapter.dag_success, dag_run=dag_run, msg=msg)
347
442
 
348
443
  @hookimpl
349
- def on_dag_run_failed(self, dag_run: DagRun, msg: str):
444
+ def on_dag_run_failed(self, dag_run: DagRun, msg: str) -> None:
350
445
  if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
351
446
  self.log.debug(
352
447
  "Skipping OpenLineage event emission for DAG `%s` "
@@ -61,8 +61,8 @@ def lineage_run_id(task_instance: TaskInstance):
61
61
  return OpenLineageAdapter.build_task_instance_run_id(
62
62
  dag_id=task_instance.dag_id,
63
63
  task_id=task_instance.task_id,
64
- execution_date=task_instance.execution_date,
65
64
  try_number=task_instance.try_number,
65
+ execution_date=task_instance.execution_date,
66
66
  )
67
67
 
68
68
 
@@ -39,6 +39,7 @@ from airflow.providers.openlineage.utils.sql import (
39
39
  get_table_schemas,
40
40
  )
41
41
  from airflow.typing_compat import TypedDict
42
+ from airflow.utils.log.logging_mixin import LoggingMixin
42
43
 
43
44
  if TYPE_CHECKING:
44
45
  from sqlalchemy.engine import Engine
@@ -116,7 +117,7 @@ def from_table_meta(
116
117
  return Dataset(namespace=namespace, name=name if not is_uppercase else name.upper())
117
118
 
118
119
 
119
- class SQLParser:
120
+ class SQLParser(LoggingMixin):
120
121
  """Interface for openlineage-sql.
121
122
 
122
123
  :param dialect: dialect specific to the database
@@ -124,11 +125,18 @@ class SQLParser:
124
125
  """
125
126
 
126
127
  def __init__(self, dialect: str | None = None, default_schema: str | None = None) -> None:
128
+ super().__init__()
127
129
  self.dialect = dialect
128
130
  self.default_schema = default_schema
129
131
 
130
132
  def parse(self, sql: list[str] | str) -> SqlMeta | None:
131
133
  """Parse a single or a list of SQL statements."""
134
+ self.log.debug(
135
+ "OpenLineage calling SQL parser with SQL %s dialect %s schema %s",
136
+ sql,
137
+ self.dialect,
138
+ self.default_schema,
139
+ )
132
140
  return parse(sql=sql, dialect=self.dialect, default_schema=self.default_schema)
133
141
 
134
142
  def parse_table_schemas(
@@ -151,6 +159,7 @@ class SQLParser:
151
159
  "database": database or database_info.database,
152
160
  "use_flat_cross_db_query": database_info.use_flat_cross_db_query,
153
161
  }
162
+ self.log.info("PRE getting schemas for input and output tables")
154
163
  return get_table_schemas(
155
164
  hook,
156
165
  namespace,
@@ -335,9 +344,8 @@ class SQLParser:
335
344
  return split_statement(sql)
336
345
  return [obj for stmt in sql for obj in cls.split_sql_string(stmt) if obj != ""]
337
346
 
338
- @classmethod
339
347
  def create_information_schema_query(
340
- cls,
348
+ self,
341
349
  tables: list[DbTableMeta],
342
350
  normalize_name: Callable[[str], str],
343
351
  is_cross_db: bool,
@@ -349,7 +357,7 @@ class SQLParser:
349
357
  sqlalchemy_engine: Engine | None = None,
350
358
  ) -> str:
351
359
  """Create SELECT statement to query information schema table."""
352
- tables_hierarchy = cls._get_tables_hierarchy(
360
+ tables_hierarchy = self._get_tables_hierarchy(
353
361
  tables,
354
362
  normalize_name=normalize_name,
355
363
  database=database,
@@ -16,6 +16,7 @@
16
16
  # under the License.
17
17
  from __future__ import annotations
18
18
 
19
+ import logging
19
20
  from collections import defaultdict
20
21
  from contextlib import closing
21
22
  from enum import IntEnum
@@ -33,6 +34,9 @@ if TYPE_CHECKING:
33
34
  from airflow.hooks.base import BaseHook
34
35
 
35
36
 
37
+ log = logging.getLogger(__name__)
38
+
39
+
36
40
  class ColumnIndex(IntEnum):
37
41
  """Enumerates the indices of columns in information schema view."""
38
42
 
@@ -90,6 +94,7 @@ def get_table_schemas(
90
94
  if not in_query and not out_query:
91
95
  return [], []
92
96
 
97
+ log.debug("Starting to query database for table schemas")
93
98
  with closing(hook.get_conn()) as conn, closing(conn.cursor()) as cursor:
94
99
  if in_query:
95
100
  cursor.execute(in_query)
@@ -101,6 +106,7 @@ def get_table_schemas(
101
106
  out_datasets = [x.to_dataset(namespace, database, schema) for x in parse_query_result(cursor)]
102
107
  else:
103
108
  out_datasets = []
109
+ log.debug("Got table schema query result from database.")
104
110
  return in_datasets, out_datasets
105
111
 
106
112
 
@@ -149,7 +155,7 @@ def create_information_schema_query(
149
155
  sqlalchemy_engine: Engine | None = None,
150
156
  ) -> str:
151
157
  """Create query for getting table schemas from information schema."""
152
- metadata = MetaData(sqlalchemy_engine)
158
+ metadata = MetaData()
153
159
  select_statements = []
154
160
  # Don't iterate over tables hierarchy, just pass it to query single information schema table
155
161
  if use_flat_cross_db_query: