apache-airflow-providers-openlineage 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.
- airflow/providers/openlineage/LICENSE +4 -4
- airflow/providers/openlineage/__init__.py +1 -1
- airflow/providers/openlineage/conf.py +16 -1
- airflow/providers/openlineage/extractors/base.py +6 -3
- airflow/providers/openlineage/facets/AirflowJobFacet.json +40 -0
- airflow/providers/openlineage/facets/AirflowRunFacet.json +261 -0
- airflow/providers/openlineage/facets/AirflowStateRunFacet.json +34 -0
- airflow/providers/openlineage/facets/__init__.py +16 -0
- airflow/providers/openlineage/get_provider_info.py +11 -3
- airflow/providers/openlineage/plugins/adapter.py +69 -15
- airflow/providers/openlineage/plugins/facets.py +46 -4
- airflow/providers/openlineage/plugins/listener.py +128 -33
- airflow/providers/openlineage/plugins/macros.py +1 -1
- airflow/providers/openlineage/sqlparser.py +16 -6
- airflow/providers/openlineage/utils/selective_enable.py +6 -3
- airflow/providers/openlineage/utils/sql.py +11 -3
- airflow/providers/openlineage/utils/utils.py +191 -22
- {apache_airflow_providers_openlineage-1.8.0.dist-info → apache_airflow_providers_openlineage-1.9.0.dist-info}/METADATA +10 -10
- apache_airflow_providers_openlineage-1.9.0.dist-info/RECORD +28 -0
- apache_airflow_providers_openlineage-1.8.0.dist-info/RECORD +0 -24
- {apache_airflow_providers_openlineage-1.8.0.dist-info → apache_airflow_providers_openlineage-1.9.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_openlineage-1.8.0.dist-info → apache_airflow_providers_openlineage-1.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -39,15 +39,56 @@ class AirflowMappedTaskRunFacet(BaseFacet):
|
|
|
39
39
|
|
|
40
40
|
@classmethod
|
|
41
41
|
def from_task_instance(cls, task_instance):
|
|
42
|
-
|
|
43
|
-
from airflow.providers.openlineage.utils.utils import get_operator_class
|
|
42
|
+
from airflow.providers.openlineage.utils.utils import get_fully_qualified_class_name
|
|
44
43
|
|
|
45
44
|
return cls(
|
|
46
45
|
mapIndex=task_instance.map_index,
|
|
47
|
-
operatorClass=
|
|
46
|
+
operatorClass=get_fully_qualified_class_name(task_instance.task),
|
|
48
47
|
)
|
|
49
48
|
|
|
50
49
|
|
|
50
|
+
@define(slots=False)
|
|
51
|
+
class AirflowJobFacet(BaseFacet):
|
|
52
|
+
"""
|
|
53
|
+
Composite Airflow job facet.
|
|
54
|
+
|
|
55
|
+
This facet encapsulates all the necessary information to re-create full scope of an Airflow DAG logic,
|
|
56
|
+
enabling reconstruction, visualization, and analysis of DAGs in a comprehensive manner.
|
|
57
|
+
It includes detailed representations of the tasks, task groups, and their hierarchical relationships,
|
|
58
|
+
making it possible to draw a graph that visually represents the entire DAG structure (like in Airflow UI).
|
|
59
|
+
It also indicates whether a task should emit an OpenLineage (OL) event, enabling consumers to anticipate
|
|
60
|
+
the number of events and identify the tasks from which they can expect these events.
|
|
61
|
+
|
|
62
|
+
Attributes:
|
|
63
|
+
taskTree: A dictionary representing the hierarchical structure of tasks in the DAG.
|
|
64
|
+
taskGroups: A dictionary that contains information about task groups within the DAG.
|
|
65
|
+
tasks: A dictionary detailing individual tasks within the DAG.
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
taskTree: dict
|
|
69
|
+
taskGroups: dict
|
|
70
|
+
tasks: dict
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@define(slots=False)
|
|
74
|
+
class AirflowStateRunFacet(BaseFacet):
|
|
75
|
+
"""
|
|
76
|
+
Airflow facet providing state information.
|
|
77
|
+
|
|
78
|
+
This facet is designed to be sent at a completion event, offering state information about
|
|
79
|
+
the DAG run and each individual task. This information is crucial for understanding
|
|
80
|
+
the execution flow and comprehensive post-run analysis and debugging, including why certain tasks
|
|
81
|
+
did not emit events, which can occur due to the use of control flow operators like the BranchOperator.
|
|
82
|
+
|
|
83
|
+
Attributes:
|
|
84
|
+
dagRunState: This indicates the final status of the entire DAG run (e.g., "success", "failed").
|
|
85
|
+
tasksState: A dictionary mapping task IDs to their respective states. (e.g., "failed", "skipped").
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
dagRunState: str
|
|
89
|
+
tasksState: dict[str, str]
|
|
90
|
+
|
|
91
|
+
|
|
51
92
|
@define(slots=False)
|
|
52
93
|
class AirflowRunFacet(BaseFacet):
|
|
53
94
|
"""Composite Airflow run facet."""
|
|
@@ -61,7 +102,8 @@ class AirflowRunFacet(BaseFacet):
|
|
|
61
102
|
|
|
62
103
|
@define(slots=False)
|
|
63
104
|
class UnknownOperatorInstance(RedactMixin):
|
|
64
|
-
"""
|
|
105
|
+
"""
|
|
106
|
+
Describes an unknown operator.
|
|
65
107
|
|
|
66
108
|
This specifies the (class) name of the operator and its properties.
|
|
67
109
|
"""
|
|
@@ -17,18 +17,23 @@
|
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
19
|
import logging
|
|
20
|
+
import os
|
|
20
21
|
from concurrent.futures import ProcessPoolExecutor
|
|
21
22
|
from datetime import datetime
|
|
22
23
|
from typing import TYPE_CHECKING
|
|
23
24
|
|
|
25
|
+
import psutil
|
|
24
26
|
from openlineage.client.serde import Serde
|
|
27
|
+
from packaging.version import Version
|
|
28
|
+
from setproctitle import getproctitle, setproctitle
|
|
25
29
|
|
|
26
|
-
from airflow import __version__ as
|
|
30
|
+
from airflow import __version__ as AIRFLOW_VERSION, settings
|
|
27
31
|
from airflow.listeners import hookimpl
|
|
28
32
|
from airflow.providers.openlineage import conf
|
|
29
33
|
from airflow.providers.openlineage.extractors import ExtractorManager
|
|
30
34
|
from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter, RunState
|
|
31
35
|
from airflow.providers.openlineage.utils.utils import (
|
|
36
|
+
get_airflow_job_facet,
|
|
32
37
|
get_airflow_run_facet,
|
|
33
38
|
get_custom_facets,
|
|
34
39
|
get_job_name,
|
|
@@ -36,6 +41,7 @@ from airflow.providers.openlineage.utils.utils import (
|
|
|
36
41
|
is_selective_lineage_enabled,
|
|
37
42
|
print_warning,
|
|
38
43
|
)
|
|
44
|
+
from airflow.settings import configure_orm
|
|
39
45
|
from airflow.stats import Stats
|
|
40
46
|
from airflow.utils.timeout import timeout
|
|
41
47
|
|
|
@@ -43,18 +49,29 @@ if TYPE_CHECKING:
|
|
|
43
49
|
from sqlalchemy.orm import Session
|
|
44
50
|
|
|
45
51
|
from airflow.models import DagRun, TaskInstance
|
|
52
|
+
from airflow.utils.state import TaskInstanceState
|
|
46
53
|
|
|
47
54
|
_openlineage_listener: OpenLineageListener | None = None
|
|
55
|
+
_IS_AIRFLOW_2_10_OR_HIGHER = Version(Version(AIRFLOW_VERSION).base_version) >= Version("2.10.0")
|
|
48
56
|
|
|
49
57
|
|
|
50
58
|
def _get_try_number_success(val):
|
|
51
59
|
# todo: remove when min airflow version >= 2.10.0
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
if parse(parse(airflow_version).base_version) < parse("2.10.0"):
|
|
55
|
-
return val.try_number - 1
|
|
56
|
-
else:
|
|
60
|
+
if _IS_AIRFLOW_2_10_OR_HIGHER:
|
|
57
61
|
return val.try_number
|
|
62
|
+
return val.try_number - 1
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _executor_initializer():
|
|
66
|
+
"""
|
|
67
|
+
Initialize worker processes for the executor used for DagRun listener.
|
|
68
|
+
|
|
69
|
+
This function must be picklable, so it cannot be defined as an inner method or local function.
|
|
70
|
+
|
|
71
|
+
Reconfigures the ORM engine to prevent issues that arise when multiple processes interact with
|
|
72
|
+
the Airflow database.
|
|
73
|
+
"""
|
|
74
|
+
settings.configure_orm()
|
|
58
75
|
|
|
59
76
|
|
|
60
77
|
class OpenLineageListener:
|
|
@@ -69,10 +86,10 @@ class OpenLineageListener:
|
|
|
69
86
|
@hookimpl
|
|
70
87
|
def on_task_instance_running(
|
|
71
88
|
self,
|
|
72
|
-
previous_state,
|
|
89
|
+
previous_state: TaskInstanceState,
|
|
73
90
|
task_instance: TaskInstance,
|
|
74
91
|
session: Session, # This will always be QUEUED
|
|
75
|
-
):
|
|
92
|
+
) -> None:
|
|
76
93
|
if not getattr(task_instance, "task", None) is not None:
|
|
77
94
|
self.log.warning(
|
|
78
95
|
"No task set for TI object task_id: %s - dag_id: %s - run_id %s",
|
|
@@ -111,13 +128,16 @@ class OpenLineageListener:
|
|
|
111
128
|
# we return here because Airflow 2.3 needs task from deferred state
|
|
112
129
|
if task_instance.next_method is not None:
|
|
113
130
|
return
|
|
114
|
-
parent_run_id = self.adapter.build_dag_run_id(
|
|
131
|
+
parent_run_id = self.adapter.build_dag_run_id(
|
|
132
|
+
dag_id=dag.dag_id,
|
|
133
|
+
execution_date=dagrun.execution_date,
|
|
134
|
+
)
|
|
115
135
|
|
|
116
136
|
task_uuid = self.adapter.build_task_instance_run_id(
|
|
117
137
|
dag_id=dag.dag_id,
|
|
118
138
|
task_id=task.task_id,
|
|
119
|
-
execution_date=task_instance.execution_date,
|
|
120
139
|
try_number=task_instance.try_number,
|
|
140
|
+
execution_date=task_instance.execution_date,
|
|
121
141
|
)
|
|
122
142
|
event_type = RunState.RUNNING.value.lower()
|
|
123
143
|
operator_name = task.task_type.lower()
|
|
@@ -130,7 +150,6 @@ class OpenLineageListener:
|
|
|
130
150
|
dagrun.data_interval_start.isoformat() if dagrun.data_interval_start else None
|
|
131
151
|
)
|
|
132
152
|
data_interval_end = dagrun.data_interval_end.isoformat() if dagrun.data_interval_end else None
|
|
133
|
-
|
|
134
153
|
redacted_event = self.adapter.start_task(
|
|
135
154
|
run_id=task_uuid,
|
|
136
155
|
job_name=get_job_name(task),
|
|
@@ -153,10 +172,12 @@ class OpenLineageListener:
|
|
|
153
172
|
len(Serde.to_json(redacted_event).encode("utf-8")),
|
|
154
173
|
)
|
|
155
174
|
|
|
156
|
-
on_running
|
|
175
|
+
self._execute(on_running, "on_running", use_fork=True)
|
|
157
176
|
|
|
158
177
|
@hookimpl
|
|
159
|
-
def on_task_instance_success(
|
|
178
|
+
def on_task_instance_success(
|
|
179
|
+
self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
|
|
180
|
+
) -> None:
|
|
160
181
|
self.log.debug("OpenLineage listener got notification about task instance success")
|
|
161
182
|
|
|
162
183
|
dagrun = task_instance.dag_run
|
|
@@ -184,13 +205,16 @@ class OpenLineageListener:
|
|
|
184
205
|
|
|
185
206
|
@print_warning(self.log)
|
|
186
207
|
def on_success():
|
|
187
|
-
parent_run_id = OpenLineageAdapter.build_dag_run_id(
|
|
208
|
+
parent_run_id = OpenLineageAdapter.build_dag_run_id(
|
|
209
|
+
dag_id=dag.dag_id,
|
|
210
|
+
execution_date=dagrun.execution_date,
|
|
211
|
+
)
|
|
188
212
|
|
|
189
213
|
task_uuid = OpenLineageAdapter.build_task_instance_run_id(
|
|
190
214
|
dag_id=dag.dag_id,
|
|
191
215
|
task_id=task.task_id,
|
|
192
|
-
execution_date=task_instance.execution_date,
|
|
193
216
|
try_number=_get_try_number_success(task_instance),
|
|
217
|
+
execution_date=task_instance.execution_date,
|
|
194
218
|
)
|
|
195
219
|
event_type = RunState.COMPLETE.value.lower()
|
|
196
220
|
operator_name = task.task_type.lower()
|
|
@@ -215,10 +239,39 @@ class OpenLineageListener:
|
|
|
215
239
|
len(Serde.to_json(redacted_event).encode("utf-8")),
|
|
216
240
|
)
|
|
217
241
|
|
|
218
|
-
on_success
|
|
242
|
+
self._execute(on_success, "on_success", use_fork=True)
|
|
219
243
|
|
|
220
|
-
|
|
221
|
-
|
|
244
|
+
if _IS_AIRFLOW_2_10_OR_HIGHER:
|
|
245
|
+
|
|
246
|
+
@hookimpl
|
|
247
|
+
def on_task_instance_failed(
|
|
248
|
+
self,
|
|
249
|
+
previous_state: TaskInstanceState,
|
|
250
|
+
task_instance: TaskInstance,
|
|
251
|
+
error: None | str | BaseException,
|
|
252
|
+
session: Session,
|
|
253
|
+
) -> None:
|
|
254
|
+
self._on_task_instance_failed(
|
|
255
|
+
previous_state=previous_state, task_instance=task_instance, error=error, session=session
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
else:
|
|
259
|
+
|
|
260
|
+
@hookimpl
|
|
261
|
+
def on_task_instance_failed(
|
|
262
|
+
self, previous_state: TaskInstanceState, task_instance: TaskInstance, session: Session
|
|
263
|
+
) -> None:
|
|
264
|
+
self._on_task_instance_failed(
|
|
265
|
+
previous_state=previous_state, task_instance=task_instance, error=None, session=session
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
def _on_task_instance_failed(
|
|
269
|
+
self,
|
|
270
|
+
previous_state: TaskInstanceState,
|
|
271
|
+
task_instance: TaskInstance,
|
|
272
|
+
session: Session,
|
|
273
|
+
error: None | str | BaseException = None,
|
|
274
|
+
) -> None:
|
|
222
275
|
self.log.debug("OpenLineage listener got notification about task instance failure")
|
|
223
276
|
|
|
224
277
|
dagrun = task_instance.dag_run
|
|
@@ -246,13 +299,16 @@ class OpenLineageListener:
|
|
|
246
299
|
|
|
247
300
|
@print_warning(self.log)
|
|
248
301
|
def on_failure():
|
|
249
|
-
parent_run_id = OpenLineageAdapter.build_dag_run_id(
|
|
302
|
+
parent_run_id = OpenLineageAdapter.build_dag_run_id(
|
|
303
|
+
dag_id=dag.dag_id,
|
|
304
|
+
execution_date=dagrun.execution_date,
|
|
305
|
+
)
|
|
250
306
|
|
|
251
307
|
task_uuid = OpenLineageAdapter.build_task_instance_run_id(
|
|
252
308
|
dag_id=dag.dag_id,
|
|
253
309
|
task_id=task.task_id,
|
|
254
|
-
execution_date=task_instance.execution_date,
|
|
255
310
|
try_number=task_instance.try_number,
|
|
311
|
+
execution_date=task_instance.execution_date,
|
|
256
312
|
)
|
|
257
313
|
event_type = RunState.FAIL.value.lower()
|
|
258
314
|
operator_name = task.task_type.lower()
|
|
@@ -271,40 +327,76 @@ class OpenLineageListener:
|
|
|
271
327
|
parent_run_id=parent_run_id,
|
|
272
328
|
end_time=end_date.isoformat(),
|
|
273
329
|
task=task_metadata,
|
|
330
|
+
error=error,
|
|
274
331
|
)
|
|
275
332
|
Stats.gauge(
|
|
276
333
|
f"ol.event.size.{event_type}.{operator_name}",
|
|
277
334
|
len(Serde.to_json(redacted_event).encode("utf-8")),
|
|
278
335
|
)
|
|
279
336
|
|
|
280
|
-
on_failure
|
|
337
|
+
self._execute(on_failure, "on_failure", use_fork=True)
|
|
338
|
+
|
|
339
|
+
def _execute(self, callable, callable_name: str, use_fork: bool = False):
|
|
340
|
+
if use_fork:
|
|
341
|
+
self._fork_execute(callable, callable_name)
|
|
342
|
+
else:
|
|
343
|
+
callable()
|
|
344
|
+
|
|
345
|
+
def _terminate_with_wait(self, process: psutil.Process):
|
|
346
|
+
process.terminate()
|
|
347
|
+
try:
|
|
348
|
+
# Waiting for max 3 seconds to make sure process can clean up before being killed.
|
|
349
|
+
process.wait(timeout=3)
|
|
350
|
+
except psutil.TimeoutExpired:
|
|
351
|
+
# If it's not dead by then, then force kill.
|
|
352
|
+
process.kill()
|
|
353
|
+
|
|
354
|
+
def _fork_execute(self, callable, callable_name: str):
|
|
355
|
+
self.log.debug("Will fork to execute OpenLineage process.")
|
|
356
|
+
pid = os.fork()
|
|
357
|
+
if pid:
|
|
358
|
+
process = psutil.Process(pid)
|
|
359
|
+
try:
|
|
360
|
+
self.log.debug("Waiting for process %s", pid)
|
|
361
|
+
process.wait(conf.execution_timeout())
|
|
362
|
+
except psutil.TimeoutExpired:
|
|
363
|
+
self.log.warning(
|
|
364
|
+
"OpenLineage process %s expired. This should not affect process execution.", pid
|
|
365
|
+
)
|
|
366
|
+
self._terminate_with_wait(process)
|
|
367
|
+
except BaseException:
|
|
368
|
+
# Kill the process directly.
|
|
369
|
+
self._terminate_with_wait(process)
|
|
370
|
+
self.log.debug("Process with pid %s finished - parent", pid)
|
|
371
|
+
else:
|
|
372
|
+
setproctitle(getproctitle() + " - OpenLineage - " + callable_name)
|
|
373
|
+
configure_orm(disable_connection_pool=True)
|
|
374
|
+
self.log.debug("Executing OpenLineage process - %s - pid %s", callable_name, os.getpid())
|
|
375
|
+
callable()
|
|
376
|
+
self.log.debug("Process with current pid finishes after %s", callable_name)
|
|
377
|
+
os._exit(0)
|
|
281
378
|
|
|
282
379
|
@property
|
|
283
|
-
def executor(self):
|
|
284
|
-
def initializer():
|
|
285
|
-
# Re-configure the ORM engine as there are issues with multiple processes
|
|
286
|
-
# if process calls Airflow DB.
|
|
287
|
-
settings.configure_orm()
|
|
288
|
-
|
|
380
|
+
def executor(self) -> ProcessPoolExecutor:
|
|
289
381
|
if not self._executor:
|
|
290
382
|
self._executor = ProcessPoolExecutor(
|
|
291
383
|
max_workers=conf.dag_state_change_process_pool_size(),
|
|
292
|
-
initializer=
|
|
384
|
+
initializer=_executor_initializer,
|
|
293
385
|
)
|
|
294
386
|
return self._executor
|
|
295
387
|
|
|
296
388
|
@hookimpl
|
|
297
|
-
def on_starting(self, component):
|
|
389
|
+
def on_starting(self, component) -> None:
|
|
298
390
|
self.log.debug("on_starting: %s", component.__class__.__name__)
|
|
299
391
|
|
|
300
392
|
@hookimpl
|
|
301
|
-
def before_stopping(self, component):
|
|
393
|
+
def before_stopping(self, component) -> None:
|
|
302
394
|
self.log.debug("before_stopping: %s", component.__class__.__name__)
|
|
303
395
|
with timeout(30):
|
|
304
396
|
self.executor.shutdown(wait=True)
|
|
305
397
|
|
|
306
398
|
@hookimpl
|
|
307
|
-
def on_dag_run_running(self, dag_run: DagRun, msg: str):
|
|
399
|
+
def on_dag_run_running(self, dag_run: DagRun, msg: str) -> None:
|
|
308
400
|
if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
|
|
309
401
|
self.log.debug(
|
|
310
402
|
"Skipping OpenLineage event emission for DAG `%s` "
|
|
@@ -326,10 +418,13 @@ class OpenLineageListener:
|
|
|
326
418
|
msg=msg,
|
|
327
419
|
nominal_start_time=data_interval_start,
|
|
328
420
|
nominal_end_time=data_interval_end,
|
|
421
|
+
# AirflowJobFacet should be created outside ProcessPoolExecutor that pickles objects,
|
|
422
|
+
# as it causes lack of some TaskGroup attributes and crashes event emission.
|
|
423
|
+
job_facets={**get_airflow_job_facet(dag_run=dag_run)},
|
|
329
424
|
)
|
|
330
425
|
|
|
331
426
|
@hookimpl
|
|
332
|
-
def on_dag_run_success(self, dag_run: DagRun, msg: str):
|
|
427
|
+
def on_dag_run_success(self, dag_run: DagRun, msg: str) -> None:
|
|
333
428
|
if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
|
|
334
429
|
self.log.debug(
|
|
335
430
|
"Skipping OpenLineage event emission for DAG `%s` "
|
|
@@ -346,7 +441,7 @@ class OpenLineageListener:
|
|
|
346
441
|
self.executor.submit(self.adapter.dag_success, dag_run=dag_run, msg=msg)
|
|
347
442
|
|
|
348
443
|
@hookimpl
|
|
349
|
-
def on_dag_run_failed(self, dag_run: DagRun, msg: str):
|
|
444
|
+
def on_dag_run_failed(self, dag_run: DagRun, msg: str) -> None:
|
|
350
445
|
if dag_run.dag and not is_selective_lineage_enabled(dag_run.dag):
|
|
351
446
|
self.log.debug(
|
|
352
447
|
"Skipping OpenLineage event emission for DAG `%s` "
|
|
@@ -61,8 +61,8 @@ def lineage_run_id(task_instance: TaskInstance):
|
|
|
61
61
|
return OpenLineageAdapter.build_task_instance_run_id(
|
|
62
62
|
dag_id=task_instance.dag_id,
|
|
63
63
|
task_id=task_instance.task_id,
|
|
64
|
-
execution_date=task_instance.execution_date,
|
|
65
64
|
try_number=task_instance.try_number,
|
|
65
|
+
execution_date=task_instance.execution_date,
|
|
66
66
|
)
|
|
67
67
|
|
|
68
68
|
|
|
@@ -39,6 +39,7 @@ from airflow.providers.openlineage.utils.sql import (
|
|
|
39
39
|
get_table_schemas,
|
|
40
40
|
)
|
|
41
41
|
from airflow.typing_compat import TypedDict
|
|
42
|
+
from airflow.utils.log.logging_mixin import LoggingMixin
|
|
42
43
|
|
|
43
44
|
if TYPE_CHECKING:
|
|
44
45
|
from sqlalchemy.engine import Engine
|
|
@@ -116,19 +117,27 @@ def from_table_meta(
|
|
|
116
117
|
return Dataset(namespace=namespace, name=name if not is_uppercase else name.upper())
|
|
117
118
|
|
|
118
119
|
|
|
119
|
-
class SQLParser:
|
|
120
|
-
"""
|
|
120
|
+
class SQLParser(LoggingMixin):
|
|
121
|
+
"""
|
|
122
|
+
Interface for openlineage-sql.
|
|
121
123
|
|
|
122
124
|
:param dialect: dialect specific to the database
|
|
123
125
|
:param default_schema: schema applied to each table with no schema parsed
|
|
124
126
|
"""
|
|
125
127
|
|
|
126
128
|
def __init__(self, dialect: str | None = None, default_schema: str | None = None) -> None:
|
|
129
|
+
super().__init__()
|
|
127
130
|
self.dialect = dialect
|
|
128
131
|
self.default_schema = default_schema
|
|
129
132
|
|
|
130
133
|
def parse(self, sql: list[str] | str) -> SqlMeta | None:
|
|
131
134
|
"""Parse a single or a list of SQL statements."""
|
|
135
|
+
self.log.debug(
|
|
136
|
+
"OpenLineage calling SQL parser with SQL %s dialect %s schema %s",
|
|
137
|
+
sql,
|
|
138
|
+
self.dialect,
|
|
139
|
+
self.default_schema,
|
|
140
|
+
)
|
|
132
141
|
return parse(sql=sql, dialect=self.dialect, default_schema=self.default_schema)
|
|
133
142
|
|
|
134
143
|
def parse_table_schemas(
|
|
@@ -151,6 +160,7 @@ class SQLParser:
|
|
|
151
160
|
"database": database or database_info.database,
|
|
152
161
|
"use_flat_cross_db_query": database_info.use_flat_cross_db_query,
|
|
153
162
|
}
|
|
163
|
+
self.log.info("PRE getting schemas for input and output tables")
|
|
154
164
|
return get_table_schemas(
|
|
155
165
|
hook,
|
|
156
166
|
namespace,
|
|
@@ -235,7 +245,8 @@ class SQLParser:
|
|
|
235
245
|
sqlalchemy_engine: Engine | None = None,
|
|
236
246
|
use_connection: bool = True,
|
|
237
247
|
) -> OperatorLineage:
|
|
238
|
-
"""
|
|
248
|
+
"""
|
|
249
|
+
Parse SQL statement(s) and generate OpenLineage metadata.
|
|
239
250
|
|
|
240
251
|
Generated OpenLineage metadata contains:
|
|
241
252
|
|
|
@@ -335,9 +346,8 @@ class SQLParser:
|
|
|
335
346
|
return split_statement(sql)
|
|
336
347
|
return [obj for stmt in sql for obj in cls.split_sql_string(stmt) if obj != ""]
|
|
337
348
|
|
|
338
|
-
@classmethod
|
|
339
349
|
def create_information_schema_query(
|
|
340
|
-
|
|
350
|
+
self,
|
|
341
351
|
tables: list[DbTableMeta],
|
|
342
352
|
normalize_name: Callable[[str], str],
|
|
343
353
|
is_cross_db: bool,
|
|
@@ -349,7 +359,7 @@ class SQLParser:
|
|
|
349
359
|
sqlalchemy_engine: Engine | None = None,
|
|
350
360
|
) -> str:
|
|
351
361
|
"""Create SELECT statement to query information schema table."""
|
|
352
|
-
tables_hierarchy =
|
|
362
|
+
tables_hierarchy = self._get_tables_hierarchy(
|
|
353
363
|
tables,
|
|
354
364
|
normalize_name=normalize_name,
|
|
355
365
|
database=database,
|
|
@@ -32,7 +32,8 @@ log = logging.getLogger(__name__)
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def enable_lineage(obj: T) -> T:
|
|
35
|
-
"""
|
|
35
|
+
"""
|
|
36
|
+
Set selective enable OpenLineage parameter to True.
|
|
36
37
|
|
|
37
38
|
The method also propagates param to tasks if the object is DAG.
|
|
38
39
|
"""
|
|
@@ -48,7 +49,8 @@ def enable_lineage(obj: T) -> T:
|
|
|
48
49
|
|
|
49
50
|
|
|
50
51
|
def disable_lineage(obj: T) -> T:
|
|
51
|
-
"""
|
|
52
|
+
"""
|
|
53
|
+
Set selective enable OpenLineage parameter to False.
|
|
52
54
|
|
|
53
55
|
The method also propagates param to tasks if the object is DAG.
|
|
54
56
|
"""
|
|
@@ -73,7 +75,8 @@ def is_task_lineage_enabled(task: Operator) -> bool:
|
|
|
73
75
|
|
|
74
76
|
|
|
75
77
|
def is_dag_lineage_enabled(dag: DAG) -> bool:
|
|
76
|
-
"""
|
|
78
|
+
"""
|
|
79
|
+
Check if DAG is selectively enabled to emit OpenLineage events.
|
|
77
80
|
|
|
78
81
|
The method also checks if selective enable parameter is set to True
|
|
79
82
|
or if any of the tasks in DAG is selectively enabled.
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
|
+
import logging
|
|
19
20
|
from collections import defaultdict
|
|
20
21
|
from contextlib import closing
|
|
21
22
|
from enum import IntEnum
|
|
@@ -33,6 +34,9 @@ if TYPE_CHECKING:
|
|
|
33
34
|
from airflow.hooks.base import BaseHook
|
|
34
35
|
|
|
35
36
|
|
|
37
|
+
log = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
36
40
|
class ColumnIndex(IntEnum):
|
|
37
41
|
"""Enumerates the indices of columns in information schema view."""
|
|
38
42
|
|
|
@@ -81,7 +85,8 @@ def get_table_schemas(
|
|
|
81
85
|
in_query: str | None,
|
|
82
86
|
out_query: str | None,
|
|
83
87
|
) -> tuple[list[Dataset], list[Dataset]]:
|
|
84
|
-
"""
|
|
88
|
+
"""
|
|
89
|
+
Query database for table schemas.
|
|
85
90
|
|
|
86
91
|
Uses provided hook. Responsibility to provide queries for this function is on particular extractors.
|
|
87
92
|
If query for input or output table isn't provided, the query is skipped.
|
|
@@ -90,6 +95,7 @@ def get_table_schemas(
|
|
|
90
95
|
if not in_query and not out_query:
|
|
91
96
|
return [], []
|
|
92
97
|
|
|
98
|
+
log.debug("Starting to query database for table schemas")
|
|
93
99
|
with closing(hook.get_conn()) as conn, closing(conn.cursor()) as cursor:
|
|
94
100
|
if in_query:
|
|
95
101
|
cursor.execute(in_query)
|
|
@@ -101,11 +107,13 @@ def get_table_schemas(
|
|
|
101
107
|
out_datasets = [x.to_dataset(namespace, database, schema) for x in parse_query_result(cursor)]
|
|
102
108
|
else:
|
|
103
109
|
out_datasets = []
|
|
110
|
+
log.debug("Got table schema query result from database.")
|
|
104
111
|
return in_datasets, out_datasets
|
|
105
112
|
|
|
106
113
|
|
|
107
114
|
def parse_query_result(cursor) -> list[TableSchema]:
|
|
108
|
-
"""
|
|
115
|
+
"""
|
|
116
|
+
Fetch results from DB-API 2.0 cursor and creates list of table schemas.
|
|
109
117
|
|
|
110
118
|
For each row it creates :class:`TableSchema`.
|
|
111
119
|
"""
|
|
@@ -149,7 +157,7 @@ def create_information_schema_query(
|
|
|
149
157
|
sqlalchemy_engine: Engine | None = None,
|
|
150
158
|
) -> str:
|
|
151
159
|
"""Create query for getting table schemas from information schema."""
|
|
152
|
-
metadata = MetaData(
|
|
160
|
+
metadata = MetaData()
|
|
153
161
|
select_statements = []
|
|
154
162
|
# Don't iterate over tables hierarchy, just pass it to query single information schema table
|
|
155
163
|
if use_flat_cross_db_query:
|