apache-airflow-providers-openlineage 1.9.1rc1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- airflow/providers/openlineage/__init__.py +1 -1
- airflow/providers/openlineage/conf.py +31 -31
- airflow/providers/openlineage/extractors/base.py +25 -10
- airflow/providers/openlineage/extractors/bash.py +3 -3
- airflow/providers/openlineage/extractors/manager.py +18 -15
- airflow/providers/openlineage/extractors/python.py +3 -3
- airflow/providers/openlineage/facets/AirflowDagRunFacet.json +105 -0
- airflow/providers/openlineage/facets/AirflowRunFacet.json +4 -0
- airflow/providers/openlineage/get_provider_info.py +16 -1
- airflow/providers/openlineage/plugins/adapter.py +67 -51
- airflow/providers/openlineage/plugins/facets.py +20 -12
- airflow/providers/openlineage/plugins/listener.py +22 -13
- airflow/providers/openlineage/sqlparser.py +12 -19
- airflow/providers/openlineage/utils/sql.py +5 -5
- airflow/providers/openlineage/utils/utils.py +132 -15
- {apache_airflow_providers_openlineage-1.9.1rc1.dist-info → apache_airflow_providers_openlineage-1.10.0.dist-info}/METADATA +8 -8
- apache_airflow_providers_openlineage-1.10.0.dist-info/RECORD +29 -0
- apache_airflow_providers_openlineage-1.9.1rc1.dist-info/RECORD +0 -28
- {apache_airflow_providers_openlineage-1.9.1rc1.dist-info → apache_airflow_providers_openlineage-1.10.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_openlineage-1.9.1rc1.dist-info → apache_airflow_providers_openlineage-1.10.0.dist-info}/entry_points.txt +0 -0

airflow/providers/openlineage/plugins/adapter.py

@@ -22,24 +22,25 @@ from typing import TYPE_CHECKING
 
 import yaml
 from openlineage.client import OpenLineageClient, set_producer
-from openlineage.client.facet import (
-    BaseFacet,
-    DocumentationJobFacet,
-    ErrorMessageRunFacet,
-    JobTypeJobFacet,
-    NominalTimeRunFacet,
-    OwnershipJobFacet,
-    OwnershipJobFacetOwners,
-    ParentRunFacet,
-    ProcessingEngineRunFacet,
-    SourceCodeLocationJobFacet,
+from openlineage.client.event_v2 import Job, Run, RunEvent, RunState
+from openlineage.client.facet_v2 import (
+    JobFacet,
+    RunFacet,
+    documentation_job,
+    error_message_run,
+    job_type_job,
+    nominal_time_run,
+    ownership_job,
+    parent_run,
+    processing_engine_run,
+    source_code_location_job,
 )
-from openlineage.client.run import Job, Run, RunEvent, RunState
 from openlineage.client.uuid import generate_static_uuid
 
 from airflow.providers.openlineage import __version__ as OPENLINEAGE_PROVIDER_VERSION, conf
 from airflow.providers.openlineage.utils.utils import (
     OpenLineageRedactor,
+    get_airflow_dag_run_facet,
     get_airflow_state_run_facet,
 )
 from airflow.stats import Stats

@@ -59,8 +60,8 @@ set_producer(_PRODUCER)
 # https://openlineage.io/docs/spec/facets/job-facets/job-type
 # They must be set after the `set_producer(_PRODUCER)`
 # otherwise the `JobTypeJobFacet._producer` will be set with the default value
-_JOB_TYPE_DAG = JobTypeJobFacet(jobType="DAG", integration="AIRFLOW", processingType="BATCH")
-_JOB_TYPE_TASK = JobTypeJobFacet(jobType="TASK", integration="AIRFLOW", processingType="BATCH")
+_JOB_TYPE_DAG = job_type_job.JobTypeJobFacet(jobType="DAG", integration="AIRFLOW", processingType="BATCH")
+_JOB_TYPE_TASK = job_type_job.JobTypeJobFacet(jobType="TASK", integration="AIRFLOW", processingType="BATCH")
 
 
 class OpenLineageAdapter(LoggingMixin):
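
The import change above sets the pattern for the rest of this release: facet classes are no longer imported flat from `openlineage.client.facet` but through their `facet_v2` submodules. A minimal sketch of the new style; it assumes an openlineage-python client new enough to ship `facet_v2` (the exact minimum client version is not stated in this diff):

```python
# Sketch of the facet_v2 style used throughout this release; assumes an
# openlineage-python client that provides the facet_v2 submodules.
from openlineage.client.facet_v2 import job_type_job, nominal_time_run

# Module-scoped classes replace the flat names from openlineage.client.facet,
# e.g. JobTypeJobFacet -> job_type_job.JobTypeJobFacet.
dag_job_type = job_type_job.JobTypeJobFacet(
    jobType="DAG", integration="AIRFLOW", processingType="BATCH"
)
nominal_time = nominal_time_run.NominalTimeRunFacet(
    "2024-01-01T00:00:00Z", "2024-01-02T00:00:00Z"
)
```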

@@ -148,7 +149,7 @@ class OpenLineageAdapter(LoggingMixin):
         if not self._client:
             self._client = self.get_or_create_openlineage_client()
         redacted_event: RunEvent = self._redacter.redact(event, max_depth=20)  # type: ignore[assignment]
-        event_type = event.eventType.value.lower()
+        event_type = event.eventType.value.lower() if event.eventType else ""
         transport_type = f"{self._client.transport.kind}".lower()
 
         try:

@@ -177,7 +178,7 @@ class OpenLineageAdapter(LoggingMixin):
         nominal_end_time: str | None,
         owners: list[str],
         task: OperatorLineage | None,
-        run_facets: dict[str, BaseFacet] | None = None,
+        run_facets: dict[str, RunFacet] | None = None,
     ) -> RunEvent:
         """
         Emit openlineage event of type START.

@@ -198,14 +199,13 @@ class OpenLineageAdapter(LoggingMixin):
         """
         from airflow.version import version as AIRFLOW_VERSION
 
-        processing_engine_version_facet = ProcessingEngineRunFacet(
+        processing_engine_version_facet = processing_engine_run.ProcessingEngineRunFacet(
             version=AIRFLOW_VERSION,
             name="Airflow",
             openlineageAdapterVersion=OPENLINEAGE_PROVIDER_VERSION,
         )
 
-
-        run_facets = {}
+        run_facets = run_facets or {}
         if task:
             run_facets = {**task.run_facets, **run_facets}
         run_facets["processing_engine"] = processing_engine_version_facet  # type: ignore
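
The new merge order matters: facets passed in by the caller override same-named facets coming from the extractor's `task.run_facets`, and the `processing_engine` facet is always written last. A tiny illustration of that dict-unpacking precedence (plain strings stand in for facet objects):

```python
# Illustration of {**task.run_facets, **run_facets}: later entries win,
# so caller-supplied facets override extractor-supplied ones of the same name.
task_facets = {"shared": "from-extractor", "extractor_only": "kept"}
caller_facets = {"shared": "from-caller"}

merged = {**task_facets, **caller_facets}
assert merged == {"shared": "from-caller", "extractor_only": "kept"}
```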

@@ -243,6 +243,7 @@ class OpenLineageAdapter(LoggingMixin):
         parent_run_id: str | None,
         end_time: str,
         task: OperatorLineage,
+        run_facets: dict[str, RunFacet] | None = None,
     ) -> RunEvent:
         """
         Emit openlineage event of type COMPLETE.

@@ -254,7 +255,11 @@ class OpenLineageAdapter(LoggingMixin):
         :param parent_run_id: identifier of job spawning this task
         :param end_time: time of task completion
         :param task: metadata container with information extracted from operator
+        :param run_facets: additional run facets
         """
+        run_facets = run_facets or {}
+        if task:
+            run_facets = {**task.run_facets, **run_facets}
         event = RunEvent(
             eventType=RunState.COMPLETE,
             eventTime=end_time,

@@ -263,7 +268,7 @@ class OpenLineageAdapter(LoggingMixin):
                 job_name=job_name,
                 parent_job_name=parent_job_name,
                 parent_run_id=parent_run_id,
-                run_facets=task.run_facets,
+                run_facets=run_facets,
             ),
             job=self._build_job(job_name, job_type=_JOB_TYPE_TASK, job_facets=task.job_facets),
             inputs=task.inputs,

@@ -281,6 +286,7 @@ class OpenLineageAdapter(LoggingMixin):
         end_time: str,
         task: OperatorLineage,
         error: str | BaseException | None = None,
+        run_facets: dict[str, RunFacet] | None = None,
     ) -> RunEvent:
         """
         Emit openlineage event of type FAIL.

@@ -292,21 +298,23 @@ class OpenLineageAdapter(LoggingMixin):
         :param parent_run_id: identifier of job spawning this task
         :param end_time: time of task completion
         :param task: metadata container with information extracted from operator
+        :param run_facets: custom run facets
         :param error: error
+        :param run_facets: additional run facets
         """
-        error_facet = {}
+        run_facets = run_facets or {}
+        if task:
+            run_facets = {**task.run_facets, **run_facets}
+
         if error:
             stack_trace = None
             if isinstance(error, BaseException) and error.__traceback__:
                 import traceback
 
                 stack_trace = "\n".join(traceback.format_exception(type(error), error, error.__traceback__))
-
-            error_facet = {
-                "errorMessage": ErrorMessageRunFacet(
-                    message=str(error), programmingLanguage="python", stackTrace=stack_trace
-                )
-            }
+            run_facets["errorMessage"] = error_message_run.ErrorMessageRunFacet(
+                message=str(error), programmingLanguage="python", stackTrace=stack_trace
+            )
 
         event = RunEvent(
             eventType=RunState.FAIL,
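
A hedged sketch of the new failure path in isolation: the error facet is now built with `error_message_run.ErrorMessageRunFacet` and stored directly in `run_facets`, and a stack trace is attached only when a real exception carrying a traceback is passed (mirroring the `traceback.format_exception` call above):

```python
from __future__ import annotations

import traceback

from openlineage.client.facet_v2 import error_message_run


def build_error_facet(error: str | BaseException) -> error_message_run.ErrorMessageRunFacet:
    # Mirrors fail_task above: only real exceptions with a traceback get a stack trace.
    stack_trace = None
    if isinstance(error, BaseException) and error.__traceback__:
        stack_trace = "\n".join(traceback.format_exception(type(error), error, error.__traceback__))
    return error_message_run.ErrorMessageRunFacet(
        message=str(error), programmingLanguage="python", stackTrace=stack_trace
    )
```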

@@ -316,7 +324,7 @@ class OpenLineageAdapter(LoggingMixin):
                 job_name=job_name,
                 parent_job_name=parent_job_name,
                 parent_run_id=parent_run_id,
-                run_facets={**task.run_facets, **error_facet},
+                run_facets=run_facets,
             ),
             job=self._build_job(job_name, job_type=_JOB_TYPE_TASK, job_facets=task.job_facets),
             inputs=task.inputs,

@@ -331,9 +339,10 @@ class OpenLineageAdapter(LoggingMixin):
         msg: str,
         nominal_start_time: str,
         nominal_end_time: str,
-        job_facets: dict[str, BaseFacet] | None = None,
+        job_facets: dict[str, JobFacet] | None = None,  # Custom job facets
     ):
         try:
+            owner = [x.strip() for x in dag_run.dag.owner.split(",")] if dag_run.dag else None
             event = RunEvent(
                 eventType=RunState.START,
                 eventTime=dag_run.start_date.isoformat(),

@@ -341,7 +350,7 @@ class OpenLineageAdapter(LoggingMixin):
                     job_name=dag_run.dag_id,
                     job_type=_JOB_TYPE_DAG,
                     job_description=dag_run.dag.description if dag_run.dag else None,
-                    owners=dag_run.dag.owner.split(", ") if dag_run.dag else None,
+                    owners=owner,
                     job_facets=job_facets,
                 ),
                 run=self._build_run(

@@ -352,6 +361,7 @@ class OpenLineageAdapter(LoggingMixin):
                     job_name=dag_run.dag_id,
                     nominal_start_time=nominal_start_time,
                     nominal_end_time=nominal_end_time,
+                    run_facets=get_airflow_dag_run_facet(dag_run),
                 ),
                 inputs=[],
                 outputs=[],

@@ -400,7 +410,9 @@ class OpenLineageAdapter(LoggingMixin):
                     execution_date=dag_run.execution_date,
                 ),
                 facets={
-                    "errorMessage": ErrorMessageRunFacet(message=msg, programmingLanguage="python"),
+                    "errorMessage": error_message_run.ErrorMessageRunFacet(
+                        message=msg, programmingLanguage="python"
+                    ),
                     **get_airflow_state_run_facet(dag_run),
                 },
             ),

@@ -423,23 +435,19 @@ class OpenLineageAdapter(LoggingMixin):
         parent_run_id: str | None = None,
         nominal_start_time: str | None = None,
         nominal_end_time: str | None = None,
-        run_facets: dict[str, BaseFacet] | None = None,
+        run_facets: dict[str, RunFacet] | None = None,
     ) -> Run:
-        facets: dict[str, BaseFacet] = {}
+        facets: dict[str, RunFacet] = {}
         if nominal_start_time:
-            facets.update({"nominalTime": NominalTimeRunFacet(nominal_start_time, nominal_end_time)})
-        if parent_run_id:
-            parent_run_facet = ParentRunFacet.create(
-                runId=parent_run_id,
-                namespace=conf.namespace(),
-                name=parent_job_name or job_name,
-            )
             facets.update(
-                {
-                    "parent": parent_run_facet,
-                    "parentRun": parent_run_facet,  # Keep sending this for the backward compatibility
-                }
+                {"nominalTime": nominal_time_run.NominalTimeRunFacet(nominal_start_time, nominal_end_time)}
             )
+        if parent_run_id:
+            parent_run_facet = parent_run.ParentRunFacet(
+                run=parent_run.Run(runId=parent_run_id),
+                job=parent_run.Job(namespace=conf.namespace(), name=parent_job_name or job_name),
+            )
+            facets.update({"parent": parent_run_facet})
 
         if run_facets:
             facets.update(run_facets)
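
Besides the module move, the parent facet changes shape: the old `ParentRunFacet.create(runId=..., namespace=..., name=...)` helper gives way to the nested `parent_run.Run` / `parent_run.Job` form, and the duplicate `parentRun` key is no longer emitted. A short sketch of the new construction with placeholder identifiers:

```python
from openlineage.client.facet_v2 import parent_run

# Placeholder namespace, job name and run id; _build_run above fills these
# from conf.namespace() and the parent job's name and run id.
parent_facet = parent_run.ParentRunFacet(
    run=parent_run.Run(runId="3f1a2b3c-4d5e-6f70-8192-a3b4c5d6e7f8"),
    job=parent_run.Job(namespace="default", name="my_dag"),
)
facets = {"parent": parent_facet}
```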

@@ -449,23 +457,31 @@ class OpenLineageAdapter(LoggingMixin):
     @staticmethod
     def _build_job(
         job_name: str,
-        job_type: JobTypeJobFacet,
+        job_type: job_type_job.JobTypeJobFacet,
         job_description: str | None = None,
         code_location: str | None = None,
         owners: list[str] | None = None,
-        job_facets: dict[str, BaseFacet] | None = None,
+        job_facets: dict[str, JobFacet] | None = None,
     ):
-        facets: dict[str, BaseFacet] = {}
+        facets: dict[str, JobFacet] = {}
 
         if job_description:
-            facets.update({"documentation": DocumentationJobFacet(description=job_description)})
+            facets.update(
+                {"documentation": documentation_job.DocumentationJobFacet(description=job_description)}
+            )
         if code_location:
-            facets.update({"sourceCodeLocation": SourceCodeLocationJobFacet("", url=code_location)})
+            facets.update(
+                {
+                    "sourceCodeLocation": source_code_location_job.SourceCodeLocationJobFacet(
+                        "", url=code_location
+                    )
+                }
+            )
         if owners:
             facets.update(
                 {
-                    "ownership": OwnershipJobFacet(
-                        owners=[OwnershipJobFacetOwners(name=owner) for owner in owners]
+                    "ownership": ownership_job.OwnershipJobFacet(
+                        owners=[ownership_job.Owner(name=owner) for owner in owners]
                     )
                 }
             )

airflow/providers/openlineage/plugins/facets.py

@@ -18,7 +18,7 @@ from __future__ import annotations
 
 from attrs import define
 from deprecated import deprecated
-from openlineage.client.facet import BaseFacet
+from openlineage.client.facet_v2 import JobFacet, RunFacet
 from openlineage.client.utils import RedactMixin
 
 from airflow.exceptions import AirflowProviderDeprecationWarning

@@ -28,8 +28,8 @@ from airflow.exceptions import AirflowProviderDeprecationWarning
     reason="To be removed in the next release. Make sure to use information from AirflowRunFacet instead.",
     category=AirflowProviderDeprecationWarning,
 )
-@define
-class AirflowMappedTaskRunFacet(BaseFacet):
+@define
+class AirflowMappedTaskRunFacet(RunFacet):
     """Run facet containing information about mapped tasks."""
 
     mapIndex: int

@@ -47,8 +47,8 @@ class AirflowMappedTaskRunFacet(BaseFacet):
     )
 
 
-@define
-class AirflowJobFacet(BaseFacet):
+@define
+class AirflowJobFacet(JobFacet):
     """
     Composite Airflow job facet.
 

@@ -70,8 +70,8 @@ class AirflowJobFacet(BaseFacet):
     tasks: dict
 
 
-@define
-class AirflowStateRunFacet(BaseFacet):
+@define
+class AirflowStateRunFacet(RunFacet):
     """
     Airflow facet providing state information.
 

@@ -89,8 +89,8 @@ class AirflowStateRunFacet(BaseFacet):
     tasksState: dict[str, str]
 
 
-@define
-class AirflowRunFacet(BaseFacet):
+@define
+class AirflowRunFacet(RunFacet):
     """Composite Airflow run facet."""
 
     dag: dict

@@ -100,7 +100,15 @@ class AirflowRunFacet(BaseFacet):
     taskUuid: str
 
 
-@define
+@define
+class AirflowDagRunFacet(RunFacet):
+    """Composite Airflow DAG run facet."""
+
+    dag: dict
+    dagRun: dict
+
+
+@define
 class UnknownOperatorInstance(RedactMixin):
     """
     Describes an unknown operator.

@@ -119,8 +127,8 @@ class UnknownOperatorInstance(RedactMixin):
     reason="To be removed in the next release. Make sure to use information from AirflowRunFacet instead.",
     category=AirflowProviderDeprecationWarning,
 )
-@define
-class UnknownOperatorAttributeRunFacet(BaseFacet):
+@define
+class UnknownOperatorAttributeRunFacet(RunFacet):
     """RunFacet that describes unknown operators in an Airflow DAG."""
 
     unknownItems: list[UnknownOperatorInstance]
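
Based only on the definition above, the new `AirflowDagRunFacet` is a plain attrs facet with two dictionary payloads. A rough sketch of how it could be instantiated; the serialized `dag` and `dagRun` dictionaries and the `airflowDagRun` facet key are placeholders here, since the code that fills them (`get_airflow_dag_run_facet`) is not shown in this diff:

```python
from airflow.providers.openlineage.plugins.facets import AirflowDagRunFacet

# Placeholder payloads; the provider builds these from the real DAG and DagRun
# objects (see the new AirflowDagRunFacet.json schema in the file list above).
facet = AirflowDagRunFacet(
    dag={"dag_id": "my_dag", "schedule_interval": "@daily"},
    dagRun={"run_id": "scheduled__2024-01-01T00:00:00+00:00", "state": "running"},
)
run_facets = {"airflowDagRun": facet}
```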

airflow/providers/openlineage/plugins/listener.py

@@ -19,45 +19,45 @@ from __future__ import annotations
 import logging
 import os
 from concurrent.futures import ProcessPoolExecutor
-from datetime import datetime
 from typing import TYPE_CHECKING
 
 import psutil
 from openlineage.client.serde import Serde
-from packaging.version import Version
 from setproctitle import getproctitle, setproctitle
 
-from airflow import __version__ as AIRFLOW_VERSION, settings
+from airflow import settings
 from airflow.listeners import hookimpl
 from airflow.providers.openlineage import conf
 from airflow.providers.openlineage.extractors import ExtractorManager
 from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter, RunState
 from airflow.providers.openlineage.utils.utils import (
+    IS_AIRFLOW_2_10_OR_HIGHER,
     get_airflow_job_facet,
+    get_airflow_mapped_task_facet,
     get_airflow_run_facet,
-    get_custom_facets,
     get_job_name,
+    get_user_provided_run_facets,
     is_operator_disabled,
     is_selective_lineage_enabled,
     print_warning,
 )
 from airflow.settings import configure_orm
 from airflow.stats import Stats
+from airflow.utils import timezone
+from airflow.utils.state import TaskInstanceState
 from airflow.utils.timeout import timeout
 
 if TYPE_CHECKING:
     from sqlalchemy.orm import Session
 
     from airflow.models import DagRun, TaskInstance
-    from airflow.utils.state import TaskInstanceState
 
 _openlineage_listener: OpenLineageListener | None = None
-_IS_AIRFLOW_2_10_OR_HIGHER = Version(Version(AIRFLOW_VERSION).base_version) >= Version("2.10.0")
 
 
 def _get_try_number_success(val):
     # todo: remove when min airflow version >= 2.10.0
-    if _IS_AIRFLOW_2_10_OR_HIGHER:
+    if IS_AIRFLOW_2_10_OR_HIGHER:
         return val.try_number
     return val.try_number - 1
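
The listener previously computed its own module-level flag (the removed `_IS_AIRFLOW_2_10_OR_HIGHER` line above); the same check now lives in `utils.utils` as `IS_AIRFLOW_2_10_OR_HIGHER`. The comparison itself, equivalent to the removed line:

```python
from packaging.version import Version

from airflow import __version__ as AIRFLOW_VERSION

# Same check as the removed constant: base_version strips rc/dev/beta suffixes
# so that e.g. "2.10.0rc1" still compares as 2.10.0.
IS_AIRFLOW_2_10_OR_HIGHER = Version(Version(AIRFLOW_VERSION).base_version) >= Version("2.10.0")
```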

@@ -145,7 +145,7 @@ class OpenLineageListener:
             with Stats.timer(f"ol.extract.{event_type}.{operator_name}"):
                 task_metadata = self.extractor_manager.extract_metadata(dagrun, task)
 
-            start_date = task_instance.start_date if task_instance.start_date else datetime.now()
+            start_date = task_instance.start_date if task_instance.start_date else timezone.utcnow()
             data_interval_start = (
                 dagrun.data_interval_start.isoformat() if dagrun.data_interval_start else None
             )

@@ -163,7 +163,8 @@ class OpenLineageListener:
                 owners=dag.owner.split(", "),
                 task=task_metadata,
                 run_facets={
-                    **get_custom_facets(task_instance),
+                    **get_user_provided_run_facets(task_instance, TaskInstanceState.RUNNING),
+                    **get_airflow_mapped_task_facet(task_instance),
                     **get_airflow_run_facet(dagrun, dag, task_instance, task, task_uuid),
                 },
             )

@@ -224,7 +225,7 @@ class OpenLineageListener:
                 dagrun, task, complete=True, task_instance=task_instance
             )
 
-            end_date = task_instance.end_date if task_instance.end_date else datetime.now()
+            end_date = task_instance.end_date if task_instance.end_date else timezone.utcnow()
 
             redacted_event = self.adapter.complete_task(
                 run_id=task_uuid,

@@ -233,6 +234,10 @@ class OpenLineageListener:
                 parent_run_id=parent_run_id,
                 end_time=end_date.isoformat(),
                 task=task_metadata,
+                run_facets={
+                    **get_user_provided_run_facets(task_instance, TaskInstanceState.SUCCESS),
+                    **get_airflow_run_facet(dagrun, dag, task_instance, task, task_uuid),
+                },
             )
             Stats.gauge(
                 f"ol.event.size.{event_type}.{operator_name}",

@@ -241,7 +246,7 @@ class OpenLineageListener:
 
         self._execute(on_success, "on_success", use_fork=True)
 
-    if _IS_AIRFLOW_2_10_OR_HIGHER:
+    if IS_AIRFLOW_2_10_OR_HIGHER:
 
         @hookimpl
         def on_task_instance_failed(

@@ -318,7 +323,7 @@ class OpenLineageListener:
                 dagrun, task, complete=True, task_instance=task_instance
            )
 
-            end_date = task_instance.end_date if task_instance.end_date else datetime.now()
+            end_date = task_instance.end_date if task_instance.end_date else timezone.utcnow()
 
             redacted_event = self.adapter.fail_task(
                 run_id=task_uuid,

@@ -328,6 +333,10 @@ class OpenLineageListener:
                 end_time=end_date.isoformat(),
                 task=task_metadata,
                 error=error,
+                run_facets={
+                    **get_user_provided_run_facets(task_instance, TaskInstanceState.FAILED),
+                    **get_airflow_run_facet(dagrun, dag, task_instance, task, task_uuid),
+                },
             )
             Stats.gauge(
                 f"ol.event.size.{event_type}.{operator_name}",

@@ -420,7 +429,7 @@ class OpenLineageListener:
             nominal_end_time=data_interval_end,
             # AirflowJobFacet should be created outside ProcessPoolExecutor that pickles objects,
             # as it causes lack of some TaskGroup attributes and crashes event emission.
-            job_facets=
+            job_facets=get_airflow_job_facet(dag_run=dag_run),

airflow/providers/openlineage/sqlparser.py

@@ -20,16 +20,8 @@ from typing import TYPE_CHECKING, Callable
 
 import sqlparse
 from attrs import define
-from openlineage.client.facet import (
-    BaseFacet,
-    ColumnLineageDatasetFacet,
-    ColumnLineageDatasetFacetFieldsAdditional,
-    ColumnLineageDatasetFacetFieldsAdditionalInputFields,
-    ExtractionError,
-    ExtractionErrorRunFacet,
-    SqlJobFacet,
-)
-from openlineage.client.run import Dataset
+from openlineage.client.event_v2 import Dataset
+from openlineage.client.facet_v2 import column_lineage_dataset, extraction_error_run, sql_job
 from openlineage.common.sql import DbTableMeta, SqlMeta, parse
 
 from airflow.providers.openlineage.extractors.base import OperatorLineage

@@ -42,6 +34,7 @@ from airflow.typing_compat import TypedDict
 from airflow.utils.log.logging_mixin import LoggingMixin
 
 if TYPE_CHECKING:
+    from openlineage.client.facet_v2 import JobFacet, RunFacet
     from sqlalchemy.engine import Engine
 
     from airflow.hooks.base import BaseHook

@@ -160,7 +153,6 @@ class SQLParser(LoggingMixin):
             "database": database or database_info.database,
             "use_flat_cross_db_query": database_info.use_flat_cross_db_query,
         }
-        self.log.info("PRE getting schemas for input and output tables")
         return get_table_schemas(
             hook,
             namespace,

@@ -207,11 +199,12 @@ class SQLParser(LoggingMixin):
         if not len(parse_result.column_lineage):
             return
         for dataset in datasets:
-            dataset.facets["columnLineage"] = ColumnLineageDatasetFacet(
+            dataset.facets = dataset.facets or {}
+            dataset.facets["columnLineage"] = column_lineage_dataset.ColumnLineageDatasetFacet(
                 fields={
-                    column_lineage.descendant.name: ColumnLineageDatasetFacetFieldsAdditional(
+                    column_lineage.descendant.name: column_lineage_dataset.Fields(
                         inputFields=[
-                            ColumnLineageDatasetFacetFieldsAdditionalInputFields(
+                            column_lineage_dataset.InputField(
                                 namespace=dataset.namespace,
                                 name=".".join(
                                     filter(

@@ -261,18 +254,18 @@ class SQLParser(LoggingMixin):
         :param database: when passed it takes precedence over parsed database name
         :param sqlalchemy_engine: when passed, engine's dialect is used to compile SQL queries
         """
-        job_facets: dict[str, BaseFacet] = {"sql": SqlJobFacet(query=self.normalize_sql(sql))}
-        parse_result = self.parse(self.split_sql_string(sql))
+        job_facets: dict[str, JobFacet] = {"sql": sql_job.SQLJobFacet(query=self.normalize_sql(sql))}
+        parse_result = self.parse(sql=self.split_sql_string(sql))
         if not parse_result:
             return OperatorLineage(job_facets=job_facets)
 
-        run_facets: dict[str, BaseFacet] = {}
+        run_facets: dict[str, RunFacet] = {}
         if parse_result.errors:
-            run_facets["extractionError"] = ExtractionErrorRunFacet(
+            run_facets["extractionError"] = extraction_error_run.ExtractionErrorRunFacet(
                 totalTasks=len(sql) if isinstance(sql, list) else 1,
                 failedTasks=len(parse_result.errors),
                 errors=[
-                    ExtractionError(
+                    extraction_error_run.Error(
                         errorMessage=error.message,
                         stackTrace=None,
                         task=error.origin_statement,
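
The SQL path keeps the same facet content but sources it from `facet_v2`; note the class is now spelled `SQLJobFacet` rather than `SqlJobFacet`. A minimal sketch of the job-facet side (the query string is a stand-in for the normalized SQL the parser produces):

```python
from openlineage.client.facet_v2 import sql_job

# generate_openlineage_metadata_from_sql above stores this under the "sql" key
# of the operator's job facets.
job_facets = {"sql": sql_job.SQLJobFacet(query="SELECT id, name FROM customers")}
```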

airflow/providers/openlineage/utils/sql.py

@@ -23,8 +23,8 @@ from enum import IntEnum
 from typing import TYPE_CHECKING, Dict, List, Optional
 
 from attrs import define
-from openlineage.client.facet import SchemaDatasetFacet, SchemaField
-from openlineage.client.run import Dataset
+from openlineage.client.event_v2 import Dataset
+from openlineage.client.facet_v2 import schema_dataset
 from sqlalchemy import Column, MetaData, Table, and_, or_, union_all
 
 if TYPE_CHECKING:

@@ -60,7 +60,7 @@ class TableSchema:
     table: str
     schema: str | None
     database: str | None
-    fields: list[SchemaField]
+    fields: list[schema_dataset.SchemaDatasetFacetFields]
 
     def to_dataset(self, namespace: str, database: str | None = None, schema: str | None = None) -> Dataset:
         # Prefix the table name with database and schema name using

@@ -73,7 +73,7 @@ class TableSchema:
         return Dataset(
             namespace=namespace,
             name=name,
-            facets={"schema": SchemaDatasetFacet(fields=self.fields)} if self.fields else {},
+            facets={"schema": schema_dataset.SchemaDatasetFacet(fields=self.fields)} if self.fields else {},
         )
 
 
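Putting this file's pieces together, a `TableSchema` now serializes to a `Dataset` whose `schema` facet is built from the `schema_dataset` classes. A hedged sketch with made-up column metadata (real values come from the information_schema query parsed in `parse_query_result` below):

```python
from openlineage.client.event_v2 import Dataset
from openlineage.client.facet_v2 import schema_dataset

# Made-up columns; to_dataset above builds the same structure from TableSchema.fields.
fields = [
    schema_dataset.SchemaDatasetFacetFields(name="id", type="int4", description=None),
    schema_dataset.SchemaDatasetFacetFields(name="name", type="varchar", description=None),
]
dataset = Dataset(
    namespace="postgres://db-host:5432",
    name="public.customers",
    facets={"schema": schema_dataset.SchemaDatasetFacet(fields=fields)},
)
```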

@@ -122,7 +122,7 @@ def parse_query_result(cursor) -> list[TableSchema]:
     for row in cursor.fetchall():
         table_schema_name: str = row[ColumnIndex.SCHEMA]
         table_name: str = row[ColumnIndex.TABLE_NAME]
-        table_column = SchemaField(
+        table_column = schema_dataset.SchemaDatasetFacetFields(
             name=row[ColumnIndex.COLUMN_NAME],
             type=row[ColumnIndex.UDT_NAME],
             description=None,